# Tweets sentiment analysis

### Importing the libraries

In [1]:
import os
import numpy as np
from nltk.tokenize import TweetTokenizer

from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 
import gensim

import matplotlib.pyplot as plt

#MANAGEMENT PURPOSES ONLY-
from tqdm.notebook import tqdm
import gc
gc.collect()

0

Set seed to ensure reproducibility

In [2]:
SEED = 32
np.random.seed(SEED)

### Load processed dataset

In [3]:
X = np.load('./data/X.npy', allow_pickle=True)
y = np.load('./data/y.npy')

In [4]:
X.shape

(420, 200)

In [5]:
X[0][0]

"#USER# I'll DM you my address."

In [6]:
y[0]

1

In [7]:
X = X.tolist()
y = y.tolist()

In [8]:
tweet_tokenizer = TweetTokenizer()
model = Word2Vec(sentences=common_texts, vector_size=200, window=5, min_count=1, workers=4)

In [9]:
all_words = []
all_tweets = []

for profile in tqdm(X):
    for tweet in profile:
        tk = tweet_tokenizer.tokenize(tweet)
        all_words.extend(tk)
        all_tweets.append(tk)
        

  0%|          | 0/420 [00:00<?, ?it/s]

In [10]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, max_features=200, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(all_words)
tfidf.shape

(2598632, 200)

In [11]:
model_w2v = Word2Vec(
        all_tweets,
        vector_size=200, # desired no. of features/independent variables
        window=5, # context window size
        min_count=0,                           
        sg = 1, # 1 for skip-gram model
        hs = 0,
        negative = 10, # for negative sampling
        workers= 8, # no.of cores
        seed = SEED
) 

In [12]:
model_w2v.train(all_tweets, total_examples=200*len(X), epochs=20)

(38221528, 51972640)

In [17]:
def encode_profile(profile):
    
    data = []
    # iterate through each sentence in the file
    for tweet in profile:
        # tokenize the sentence
        tokenized_tweet = tweet_tokenizer.tokenize(tweet)
        vectorized_tweet = np.zeros(200)
        for word in tokenized_tweet:
            try:
                vectorized_tweet = vectorized_tweet + model_w2v.wv[word]
            except:
                vectorized_tweet = vectorized_tweet + model_w2v.wv["unknown"]
        data.append(vectorized_tweet)
            
    return np.asarray(data)

In [18]:
example_profile = X[0]

In [19]:
e=encode_profile(example_profile)
print(e.shape)

(200, 200)


In [20]:
encoded_profiles = {}
for i in tqdm(range(len(X)), desc='Encoding Profiles'):
    encoded_profiles[i] = np.sum(encode_profile(X[i]), axis=0)

Encoding Profiles:   0%|          | 0/420 [00:00<?, ?it/s]

In [21]:
len(encoded_profiles)

420

In [22]:
encoded_profiles[0].shape

(200,)

In [23]:
encoded_profiles[0]

array([  139.03746771, -1596.51716625,   -55.03529726,   754.14288216,
         935.63348194,   -94.13947379, -1935.04442024,   958.6061436 ,
        -720.58581398,  1882.79790292,   947.31115815,  -844.87213206,
         454.2546937 ,   332.91593077,  1422.80350982,  -941.87401033,
        1587.40539863,   774.00476438,    18.53832218,   181.44185229,
         375.54564469,   572.59886276,  2989.22098782,  -391.80501315,
         303.62366889,  -728.29402967,   199.41915773,  -190.37461662,
        -193.53695471,   990.82347056,  -364.65634639,   709.0129572 ,
       -1351.09331947,   777.46250006,  -689.34142165,  -491.33000694,
        -130.1511354 ,  -778.28955836,   436.76864026,   838.39330007,
         -44.24340629,   381.59901401,  -930.09838427,  1171.55345205,
        -767.88138949,   414.47983796,   -43.90499798,  1247.58215985,
        1165.56223529,   555.89849656,  1395.92647382,  1156.0308108 ,
        1242.95623797,   603.70891176, -1280.79894063, -1799.28428555,
      

In [24]:
output_filepath = os.path.join('data', 'profile_vectors_word2vec')

In [26]:
for i in tqdm(range(len(X)), desc='Saving Profiles'):
    np.save(os.path.join(output_filepath, str(i)+".npy"), encoded_profiles[i])


Saving Profiles:   0%|          | 0/420 [00:00<?, ?it/s]