In [54]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vanis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vanis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vanis\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Define column names
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]

# Read the JSON file into a pandas DataFrame
data = pd.read_json('C:\Personal\Minty\MIT\Sem 6\DS\detecting_suspicious_activity\sentiment140.json', lines=True)

# Select the desired columns
X = data[['text']]
Y = data['target']
Y[Y == 4] = 1

print("Data successfully loaded from JSON file.")


In [None]:
data.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,2009-04-06 22:19:45,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,2009-04-06 22:19:49,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,2009-04-06 22:19:53,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,2009-04-06 22:19:57,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,2009-04-06 22:19:57,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [None]:
data.shape

(1600000, 6)

In [4]:
# Text-preprocessing

# Missing Values
num_missing_desc = data.isnull().sum()[2]    # No. of values with msising descriptions
print('Number of missing values: ' + str(num_missing_desc))
data = data.dropna()

TAG_CLEANING_RE = "@\S+"
# Remove @tags
X['text'] = X['text'].map(lambda x: re.sub(TAG_CLEANING_RE, ' ', x))

# Smart lowercase
X['text'] = X['text'].map(lambda x: x.lower())

# Remove numbers
X['text'] = X['text'].map(lambda x: re.sub(r'\d+', ' ', x))

# Remove links
TEXT_CLEANING_RE = "https?:\S+|http?:\S|[^A-Za-z0-9]+"
X['text'] = X['text'].map(lambda x: re.sub(TEXT_CLEANING_RE, ' ', x))

# Remove Punctuation
X['text']  = X['text'].map(lambda x: x.translate(x.maketrans('', '', string.punctuation)))

# Remove white spaces
X['text'] = X['text'].map(lambda x: x.strip())

# Tokenize into words
X['text'] = X['text'].map(lambda x: word_tokenize(x))

# Remove non alphabetic tokens
X['text'] = X['text'].map(lambda x: [word for word in x if word.isalpha()])

# Filter out stop words
stop_words = set(stopwords.words('english'))
X['text'] = X['text'].map(lambda x: [w for w in x if not w in stop_words])

# Word Lemmatization
lem = WordNetLemmatizer()
X['text'] = X['text'].map(lambda x: [lem.lemmatize(word,"v") for word in x])

# Turn lists back to string
X['text'] = X['text'].map(lambda x: ' '.join(x))

Number of missing values: 0


In [5]:
X.head()

Unnamed: 0,text
0,zl awww bummer shoulda get david carr third day
1,upset update facebook texting might cry result...
2,dive many time ball manage save rest go bound
3,whole body feel itchy like fire
4,behave mad see


In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print("TRAIN size:", len(X_train))
print("TEST size:", len(X_train))

TRAIN size: 1280000
TEST size: 1280000


## Flow

- Pre-processing
- create word2vec
- tokenize and padding
- model creation
- testing

In [7]:
# Word2Vec
import gensim

# WORD2VEC
W2V_SIZE = 300
W2V_WINDOW = 7
W2V_EPOCH = 32
W2V_MIN_COUNT = 10

documents = [_text.split() for _text in X_train.text]
w2v_model = gensim.models.word2vec.Word2Vec(vector_size=W2V_SIZE,
                                            window=W2V_WINDOW,
                                            min_count=W2V_MIN_COUNT,
                                            workers=8)
w2v_model.build_vocab(documents)

In [8]:
words = list(w2v_model.wv.index_to_key)
vocab_size = len(words)
print("Vocab size", vocab_size)

Vocab size 25276


In [9]:
# Train Word Embeddings
w2v_model.train(documents, total_examples=len(documents), epochs=W2V_EPOCH)

(251374387, 289225472)

In [10]:
#Test word embeddings
w2v_model.wv.most_similar("hey")

[('hi', 0.6796688437461853),
 ('heyy', 0.6195502281188965),
 ('hiya', 0.48511019349098206),
 ('heyyy', 0.479031503200531),
 ('heey', 0.4647592604160309),
 ('hii', 0.4566982686519623),
 ('u', 0.4523613750934601),
 ('hello', 0.4358495771884918),
 ('heya', 0.43406954407691956),
 ('heeey', 0.4340269863605499)]

In [11]:
# Tokenizing
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout
#from keras.utils.np_utils import to_categorical

# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 300
# This is fixed.
EMBEDDING_DIM = 300

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train.text)
word_index = tokenizer.word_index
vocab_size = len(word_index)
print('Found %s unique tokens.' % len(word_index))

# Convert the data to padded sequences
X_train_padded = tokenizer.texts_to_sequences(X_train.text)
X_train_padded = pad_sequences(X_train_padded, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X_train_padded.shape)

Found 232840 unique tokens.
Shape of data tensor: (1280000, 300)


In [12]:
# saving
import pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [13]:
# Embedding matrix for the embedding layer
embedding_matrix = np.zeros((vocab_size+1, W2V_SIZE))
for word, i in tokenizer.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]
print(embedding_matrix.shape)

(232841, 300)


In [14]:
# Build Model
import keras

model = Sequential()
model.add(Embedding(vocab_size+1, W2V_SIZE, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False))
model.add(Dropout(0.5))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model.summary()

model.compile(loss='binary_crossentropy',
              optimizer="adam",
              metrics=['accuracy'])



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 300, 300)          69852300  
                                                                 
 dropout (Dropout)           (None, 300, 300)          0         
                                                                 
 lstm (LSTM)                 (None, 100)               160400    
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 70012801 (267.08 MB)
Trainable params: 160501 (626.96 KB)
Non-trainable params: 69852300 (266.47 MB)
_________________________________________________________________


In [None]:
#training
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
import pickle

# Adjusted Callbacks
callbacks = [
    ReduceLROnPlateau(monitor='val_loss', patience=3, cooldown=1),  # Reduced patience and added cooldown
    EarlyStopping(monitor='val_acc', min_delta=1e-4, patience=3)  # Reduced patience
]

# Adjusted Batch Size
BATCH_SIZE = 256  # Reduced batch size

EPOCHS = 5  # Kept number of epochs

history = model.fit(
    X_train_padded,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.1,
    verbose=1,
    callbacks=callbacks
)

# Save Model and History
model.save('/content/drive/My Drive/Sentiment_LSTM_model.h5')
with open('/content/drive/My Drive/trainHistoryDict', 'wb') as file_pi:
    pickle.dump(history.history, file_pi)


Epoch 1/5



Epoch 2/5



Epoch 3/5



Epoch 4/5



Epoch 5/5





  saving_api.save_model(


In [15]:
# Load Model
from keras.models import load_model
import pickle
model = load_model('/content/drive/My Drive/detecting_suspicious_activity/Sentiment_LSTM_model.h5')
# loading tokenizer
with open('/content/drive/My Drive/detecting_suspicious_activity/trainHistoryDict', 'rb') as file_pi:
    history = pickle.load(file_pi)



In [None]:
# Evaluation
import matplotlib.pyplot as plt
X_test_padded = tokenizer.texts_to_sequences(X_test.text)
X_test_padded = pad_sequences(X_test_padded, maxlen=MAX_SEQUENCE_LENGTH)
score = model.evaluate(X_test_padded, y_test, batch_size=512)
print("ACCURACY:",score[1])
print("LOSS:",score[0])

acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(len(acc))

plt.plot(epochs, acc, 'b', label='Training acc')
plt.plot(epochs, val_acc, 'r', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure()

plt.plot(epochs, loss, 'b', label='Training loss')
plt.plot(epochs, val_loss, 'r', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()

In [16]:
def predict(text, include_neutral=True):
    # Tokenize text
    x_test = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=MAX_SEQUENCE_LENGTH)
    # Predict
    score = model.predict([x_test])[0]
    if(score >=0.4 and score<=0.6):
        label = "Neutral"
    if(score <=0.4):
        label = "Negative"
    if(score >=0.6):
        label = "Positive"

    return {"label" : label,
        "score": float(score)}

In [45]:
! pip install hdfs



In [55]:
from hdfs import InsecureClient

client = InsecureClient('http://localhost:9870', user='vanis')

client.download('/twitter_data/twitter_data.json', '/content/drive/My Drive/detecting_suspicious_activity/twitter_data.json')


ConnectionError: HTTPConnectionPool(host='localhost', port=9870): Max retries exceeded with url: /webhdfs/v1/twitter_data/twitter_data.json?user.name=vanis&op=GETFILESTATUS (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fb108795600>: Failed to establish a new connection: [Errno 111] Connection refused'))

In [56]:
import json

with open('/content/drive/My Drive/detecting_suspicious_activity/twitter_data.json','r') as reader:
    content = json.load(reader)

tweet = []

for i in content:
  tweet.append(i['Tweet'])

In [59]:
predict("Sometimes I wish I could just erase all my memories :)")



  "score": float(score)}


{'label': 'Negative', 'score': 0.11455610394477844}

In [61]:
prediction = "/content/drive/My Drive/detecting_suspicious_activity/prediction_data.json"
for i in tweet:
  l = predict(i)
  with open(prediction, 'a') as file:
    json.dump(l, file, indent=4)




  "score": float(score)}




In [None]:
!apt update
!apt install chromium-chromedriver

In [None]:
! pip install webdriver-manager
! pip install selenium

In [None]:
!pip install chromedriver-binary selenium

In [44]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from time import sleep
from hdfs import InsecureClient
import json
import os

# Load Twitter credentials
#with open('/content/drive/My Drive/detecting_suspicious_activity/twitter_keys.json', 'r') as file:
    #data = json.load(file)

uname = "analysispro1234"
passwd = "analysispro@986"

# Initialize HDFS client
client = InsecureClient('http://localhost:9870', user='vanis')

# Path to chromedriver
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

driver = webdriver.Chrome(options=options)
driver.set_window_size(1080, 800)
driver.get("https://twitter.com/login")

# Twitter search parameters
subject = "@SheroneDSouza"
start_date = "2024-01-01T00:00:00Z"
end_date = "2024-02-01T00:00:00Z"

# Login to Twitter
sleep(3)
username = driver.find_element(By.XPATH, "//input[@name='text']")
username.send_keys(uname)
next_button = driver.find_element(By.XPATH, "//span[contains(text(),'Next')]")
next_button.click()

sleep(3)
password = driver.find_element(By.XPATH, "//input[@name='password']")
password.send_keys(passwd)
log_in = driver.find_element(By.XPATH, "//span[contains(text(),'Log in')]")
log_in.click()

# Search for the subject
sleep(3)
#explore = driver.find_element(By.CSS_SELECTOR, ".css-175oi2r:nth-of-type(2)")
#explore.click()
sleep(3)
search_box = driver.find_element(By.XPATH, '//input[@data-testid="SearchBox_Search_Input"]')
search_box.send_keys(subject)
search_box.send_keys(Keys.ENTER)

sleep(5)

first_account = driver.find_element(By.CSS_SELECTOR, ".css-175oi2r.r-1awozwy.r-18u37iz.r-1wtj0ep")
first_account.click()

# Initialize lists to store data
UserTags = []
Tweets = []
Replys = []
retweets = []
likes = []

articles = driver.find_elements(By.XPATH, "//article[@data-testid='tweet']")
while True:
    for article in articles:
        try:
            UserTag = article.find_element(By.CSS_SELECTOR, "span.css-1qaijid.r-bcqeeo.r-qvutc0.r-poiln3").text
            UserTags.append(UserTag)

            Tweet = article.find_element(By.CSS_SELECTOR, "div[data-testid='tweetText']").text
            Tweets.append(Tweet)

            Reply = article.find_element(By.CSS_SELECTOR, "span[data-testid='app-text-transition-container']").text
            Replys.append(Reply)

            retweet = article.find_element(By.CSS_SELECTOR, "div[data-testid='retweet']").text
            retweets.append(retweet)

            like = article.find_element(By.CSS_SELECTOR, "div[data-testid='like']").text
            likes.append(like)

        except:
            continue

    driver.execute_script('window.scrollTo(0,document.body.scrollHeight);')
    sleep(3)
    articles = driver.find_elements(By.XPATH, "//article[@data-testid='tweet']")
    Tweets2 = list(set(Tweets))
    if len(Tweets2) > 25:
        break

    if len(UserTags) >= 25:
        break

# Combine lists into a list of dictionaries
tweet_data = []
for i in range(len(UserTags)):
    tweet_dict = {
        'UserTag': UserTags[i],
        'Tweet': Tweets[i],
        'Reply': Replys[i],
        'Retweet': retweets[i],
        'Likes': likes[i]
    }
    tweet_data.append(tweet_dict)

# Define file paths
json_file = os.path.join(os.getcwd(), 'twitter_data.json')
hdfs_file_path = '/twitter_data/twitter_data.json'

# Save data to JSON file with error handling
try:
    with open(json_file, mode='w', encoding='utf-8') as file:
        json.dump(tweet_data, file, ensure_ascii=False, indent=4)
    print(f"Data saved to {json_file}")
except Exception as e:
    print(f"Error saving data to {json_file}: {str(e)}")

driver.quit()

# Check if file exists in HDFS and delete if it does
try:
    if client.status(hdfs_file_path, strict=False):
        client.delete(hdfs_file_path)
        print(f"Existing file {hdfs_file_path} deleted.")
except Exception as err:
    if 'not a directory' in str(err):
        print(f"Error: {err}")
    else:
        raise err

# Upload new file to HDFS
try:
    client.upload(hdfs_file_path, json_file)
    print(f"File {json_file} uploaded to {hdfs_file_path}.")
except Exception as err:
    print(f"Error uploading file to HDFS: {err}")


NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//input[@data-testid="SearchBox_Search_Input"]"}
  (Session info: chrome-headless-shell=123.0.6312.105); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
#0 0x5d4f4f622873 <unknown>
#1 0x5d4f4f3188c6 <unknown>
#2 0x5d4f4f363618 <unknown>
#3 0x5d4f4f3636d1 <unknown>
#4 0x5d4f4f3a6744 <unknown>
#5 0x5d4f4f3855cd <unknown>
#6 0x5d4f4f3a3c19 <unknown>
#7 0x5d4f4f385343 <unknown>
#8 0x5d4f4f356593 <unknown>
#9 0x5d4f4f356f5e <unknown>
#10 0x5d4f4f5e685b <unknown>
#11 0x5d4f4f5ea7b5 <unknown>
#12 0x5d4f4f5d4581 <unknown>
#13 0x5d4f4f5eb342 <unknown>
#14 0x5d4f4f5b988f <unknown>
#15 0x5d4f4f611738 <unknown>
#16 0x5d4f4f61190b <unknown>
#17 0x5d4f4f6219c4 <unknown>
#18 0x7b5ccfec0ac3 <unknown>
