# Tweets sentiment analysis

### Importing the libraries

In [3]:
import os
import numpy as np
from nltk.tokenize import TweetTokenizer

from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 
import gensim

import matplotlib.pyplot as plt

# Housekeeping only: progress bars (tqdm) and garbage collection (gc)
from tqdm.notebook import tqdm
import gc
gc.collect()

ValueError: numpy.ndarray size changed, may indicate binary incompatibility. Expected 88 from C header, got 80 from PyObject

Set seed to ensure reproducibility

In [None]:
# Single seed value: applied to NumPy's global RNG here and available to any
# later cell that passes SEED on to another library.
SEED = 32
np.random.seed(SEED)

### Load processed dataset

In [None]:
# Load the preprocessed arrays from disk. X is later iterated as profiles that
# each contain tweets; y is presumably the matching labels (TODO confirm).
# allow_pickle=True lets np.load unpickle object arrays; unpickling can execute
# arbitrary code, so only load files that come from a trusted source.
X = np.load('./data/X.npy', allow_pickle=True)
y = np.load('./data/y.npy')

In [None]:
# Display the dimensions of the loaded X array.
X.shape

In [None]:
# Display the first element of the first entry of X (the first tweet of the
# first profile, given how X is iterated further down).
X[0][0]

In [None]:
# Display the first entry of y.
y[0]

In [None]:
# Convert both NumPy arrays to Python lists (ndarray.tolist) so the following
# cells work on native Python objects.
X = X.tolist()
y = y.tolist()

In [None]:
# Tokenizer used to split every tweet into tokens.
tweet_tokenizer = TweetTokenizer()
# NOTE(review): this model is fitted on gensim's bundled `common_texts` toy
# corpus, not on the tweets, and `model` is not referenced again in the visible
# notebook — looks like leftover scaffolding; confirm before removing.
model = Word2Vec(sentences=common_texts, vector_size=200, window=5, min_count=1, workers=4)

In [None]:
# Tokenize every tweet of every profile exactly once. `all_tweets` keeps one
# token list per tweet (in corpus order); `all_words` is the same tokens
# flattened into a single stream.
all_tweets = [
    tweet_tokenizer.tokenize(tweet)
    for profile in tqdm(X)
    for tweet in profile
]
all_words = [token for tokens in all_tweets for token in tokens]
        

In [None]:
# TF-IDF features: English stop words removed, terms appearing in more than 90%
# of documents or in fewer than 2 documents dropped, vocabulary capped at the
# 200 most frequent remaining terms.
# NOTE(review): `all_words` is a flat list of single tokens, so each token is
# treated as its own one-word document here; fitting on whole tweets may have
# been intended — confirm. `tfidf` is not used again in the visible notebook.
tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, max_features=200, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(all_words)
tfidf.shape

In [None]:
# Skip-gram Word2Vec fitted on the tokenized tweets. Because the corpus is
# passed to the constructor, gensim builds the vocabulary and runs its default
# training pass right here.
# NOTE(review): gensim documents that `seed` alone does not give a fully
# reproducible run with several worker threads (workers=1 and a fixed
# PYTHONHASHSEED are also required) — confirm whether that matters here.
model_w2v = Word2Vec(
        all_tweets,
        vector_size=200, # dimensionality of each word vector
        window=5, # context window size
        min_count=0, # keep every token, however rare
        sg = 1, # 1 for skip-gram model
        hs = 0, # 0 = no hierarchical softmax; negative sampling is used instead
        negative = 10, # number of noise words drawn per positive example
        workers= 8, # number of worker threads
        seed = SEED
) 

In [None]:
# Continue training for 20 more epochs on the same corpus. `total_examples`
# must match the number of sentences actually supplied (it drives the learning
# rate decay and progress reporting), so use the count gensim recorded when the
# vocabulary was built instead of assuming exactly 200 tweets per profile.
model_w2v.train(all_tweets, total_examples=model_w2v.corpus_count, epochs=20)

In [None]:
def encode_profile(profile, tokenizer=None, model=None):
    """Encode every tweet of a profile as the sum of its word vectors.

    Args:
        profile: Iterable of raw tweet strings.
        tokenizer: Object with a ``tokenize(text)`` method returning a list of
            tokens. Defaults to the notebook-level ``tweet_tokenizer``.
        model: Trained model exposing its word vectors as ``model.wv``.
            Defaults to the notebook-level ``model_w2v``.

    Returns:
        A float array of shape ``(number_of_tweets, vector_size)``. Row ``i`` is
        the sum of the vectors of the tokens of tweet ``i``. A token missing
        from the vocabulary contributes the vector of the literal token
        ``"unknown"`` when the vocabulary has one, and nothing otherwise. An
        empty profile yields an array of shape ``(0, vector_size)``.
    """
    tokenizer = tweet_tokenizer if tokenizer is None else tokenizer
    word_vectors = (model_w2v if model is None else model).wv
    dim = word_vectors.vector_size

    # Resolve the out-of-vocabulary fallback once. Looking it up blindly would
    # raise KeyError whenever "unknown" itself is not part of the vocabulary.
    fallback = word_vectors["unknown"] if "unknown" in word_vectors else None

    rows = []
    for tweet in profile:
        tweet_vector = np.zeros(dim)
        for word in tokenizer.tokenize(tweet):
            if word in word_vectors:
                tweet_vector += word_vectors[word]
            elif fallback is not None:
                tweet_vector += fallback
        rows.append(tweet_vector)

    if not rows:
        # Keep the result 2-D so that summing over axis 0 still gives a vector.
        return np.zeros((0, dim))
    return np.asarray(rows)

In [None]:
# Take the first profile as a sample input for trying out the encoder.
example_profile = X[0]

In [None]:
# Encode the sample profile and print the shape of the resulting array.
e=encode_profile(example_profile)
print(e.shape)

In [None]:
# Collapse each profile to a single vector: encode its tweets, then add the
# per-tweet vectors together. Results are keyed by the profile's position in X.
encoded_profiles = {
    idx: encode_profile(tweets).sum(axis=0)
    for idx, tweets in enumerate(tqdm(X, desc='Encoding Profiles'))
}

In [None]:
# Display how many profiles were encoded.
len(encoded_profiles)

In [None]:
# Display the shape of the first encoded profile.
encoded_profiles[0].shape

In [None]:
# Directory that receives one .npy file per encoded profile.
output_filepath = os.path.join('data', 'profile_matrices_word2vec')

In [None]:
# Write each encoded profile to <output_filepath>/<index>.npy. np.save does not
# create missing directories, so make sure the target directory exists first.
os.makedirs(output_filepath, exist_ok=True)
for i in tqdm(range(len(X)), desc='Saving Profiles'):
    np.save(os.path.join(output_filepath, str(i)+".npy"), encoded_profiles[i])


In [None]:
# Visualize the difference between two encoded profiles. Each encoded profile is
# a 1-D vector, but imshow only accepts 2-D (or RGB/RGBA) image data, so lift
# the vector to a single-row image and let the axes stretch to keep it visible.
plt.imshow(np.atleast_2d(encoded_profiles[0]-encoded_profiles[8]), aspect='auto')

In [None]:
# Visualize one encoded profile. The profile is a 1-D vector, but imshow only
# accepts 2-D (or RGB/RGBA) image data, so lift it to a single-row image and
# let the axes stretch to keep it visible.
plt.imshow(np.atleast_2d(encoded_profiles[80]), aspect='auto')