# Installing a tweet preprocessor

In [None]:

!pip install tweet-preprocessor  

# Importing the python libraries

In [None]:
!pip install gensim --upgrade
!pip install pyldavis
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import numpy as np
import itertools
from gensim.models import Word2Vec
from wordcloud import WordCloud
from google.colab import drive
from nltk.corpus import stopwords
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from textblob import TextBlob                         
from textblob import Word
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_validate

# Importing the Python libraries needed to read the CSV file from the drive

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import tensorflow as tf
import preprocessor as p
# Mount the Colab drive under /content/drive so the CSV stored there can be read.
drive.mount('/content/drive')   #mounting the drive
# Location of the tweet dataset inside the mounted drive.
path = '/content/drive/MyDrive/Tweet_Covid19.csv'   
# `df` is the working DataFrame used by every later cell.
df = pd.read_csv(path)  #reading the csv file where our tweet data is saved

In [None]:
#displaying the first few rows of dataset
df.head()

# Data Cleaning

In [None]:

#here we are cleaning the text data and checking whether there is any unwanted data
#before we generate the sentiment from the text

def preprocess_tweet(row):
    """Return the cleaned text of one tweet.

    Parameters
    ----------
    row : mapping (for example a DataFrame row)
        Must expose the raw tweet under the ``'text'`` key.

    Returns
    -------
    Whatever ``p.clean`` returns for the tweet text.
    """
    tweet = row['text']
    # Clean the tweet that was just read from the row. The previous code passed
    # an undefined name (``text``), which raised NameError on the first call.
    tweet = p.clean(tweet)
    return tweet

# Importing TextBlob library for the sentiment analysis on text

In [None]:
from textblob import TextBlob
# Analyse each tweet once and reuse the result for both columns; the sentiment
# object carries polarity and subjectivity together, so running TextBlob a
# second time per tweet only doubles the cost.
tweet_sentiments = [TextBlob(tweet).sentiment for tweet in df['text']]
df['polarity'] = [sentiment.polarity for sentiment in tweet_sentiments]
df['subjectivity'] = [sentiment.subjectivity for sentiment in tweet_sentiments]

In [None]:
#getting analysis by getAnalysis method 

from textblob import TextBlob
def getAnalysis(score):
    """Translate a numeric polarity score into a sentiment label.

    Scores below zero are 'Negative', a score of exactly zero is 'Neutral',
    and every other score is 'Positive'.
    """
    if score < 0:
        return 'Negative'
    if score == 0:
        return 'Neutral'
    return 'Positive'
df['sentiment'] = df['polarity'].apply(getAnalysis)

In [None]:
#displaying the first few rows of the dataset after the sentiment analysis on text
df.head()

In [None]:
#listing all the columns present in a dataset
list(df.columns)

In [None]:
#checking for null values and dropping them
#displaying the sentiments using histogram
# Only the last expression of a cell is displayed, so the null check has to be
# printed explicitly; as a bare statement its result was silently discarded.
print('Null values present:', df.isnull().values.any())
df = df.dropna()
df['sentiment'].hist(bins=10)

In [None]:
#removing links, punctuation and stop words, and converting the text to lowercase
nltk.download('stopwords')
nltk.download('punkt')
# Remove links FIRST. Once every non-letter has been turned into a space a URL
# is already broken into separate fragments, so these patterns could no longer
# match the whole link and its pieces would stay in the text as noise.
df['text'] = df['text'].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)
# Keep letters only (anything else becomes a space), then lowercase.
df['text'] = df['text'].replace('[^a-zA-Z]', ' ', regex=True).str.lower()
stop = stopwords.words('english')
# Membership tests against a set are constant-time; `stop` itself stays a list
# in case other cells rely on that.
stop_set = set(stop)
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_set]))
df['text']

In [None]:
#Finding most common words from the corpus and common words for each sentiment
#Using word cloud we are graphically representing the word frequency for each sentiment.
# Flush the histogram to its own figure. Without plt.show() its axes remain the
# current axes and the first word cloud below is drawn on top of the bars.
df['sentiment'].hist(bins=10)
plt.show()
# Globally
total_wordcloud = WordCloud().generate(' '.join(df['text']))
plt.imshow(total_wordcloud)
plt.title('Most common words')
plt.show()
# Per sentiment
pos_df = df[df['sentiment'] == 'Positive']
neu_df = df[df['sentiment'] == 'Neutral']
neg_df = df[df['sentiment'] == 'Negative']

pos_wordcloud = WordCloud().generate(' '.join(pos_df['text']))
neu_wordcloud = WordCloud().generate(' '.join(neu_df['text']))
neg_wordcloud = WordCloud().generate(' '.join(neg_df['text']))

# One figure per sentiment; plain statements instead of tuple expressions so
# the cell does not end with a tuple repr as its output.
for sentiment_cloud, cloud_title in (
        (pos_wordcloud, 'Most common words (positive sentiment)'),
        (neu_wordcloud, 'Most common words (neutral sentiment)'),
        (neg_wordcloud, 'Most common words (negative sentiment)')):
    plt.imshow(sentiment_cloud)
    plt.title(cloud_title)
    plt.show()



In [None]:
#comparing the sentiments between the different vaccines mentioned in the tweets (astrazeneca-pfizer-moderna)
astra_df = df[df['text'].str.contains('astrazeneca')]
moderna_df = df[df['text'].str.contains('moderna')]
# Pfizer's vaccine is also referred to by its partner's name.
pfizer_df = df[df['text'].str.contains('pfizer') | df['text'].str.contains('biontech')]

# Same plot for each vaccine. Plain statements replace the tuple expressions,
# whose last occurrence leaked a tuple repr as the cell output.
for vaccine_df, plot_title in (
        (astra_df, 'AstraZeneca sentiments'),
        (moderna_df, 'Moderna sentiments'),
        (pfizer_df, 'Pfizer sentiments')):
    plt.hist(vaccine_df['sentiment'])
    plt.title(plot_title)
    plt.show()

In [None]:
#average number of user friends for positive versus negative tweets
is_positive = df['sentiment'] == 'Positive'
is_negative = df['sentiment'] == 'Negative'
# Filter first, then cast, so only the selected rows are converted to int.
pos_friends = df.loc[is_positive, 'user_friends'].astype(int)
neg_friends = df.loc[is_negative, 'user_friends'].astype(int)
(pos_friends.mean(), neg_friends.mean())



In [None]:
#Subsampling the neutral tweets so they do not dominate the training data
print(len(df[df['sentiment'] == 'Neutral']), len(df[df['sentiment'] == 'Positive']), len(df[df['sentiment'] == 'Negative']))
# Shuffle with a fixed seed: the rows kept below are the first ones after the
# shuffle, so an unseeded shuffle produced a different subsample on every run.
df = df.sample(frac=1, random_state=42)
# Keep at most 15% of the dataset size worth of neutral tweets.
subsampled_df = df[df['sentiment'] == 'Neutral'][:int(0.15*len(df))]
print('Subsampled dataframe with neutral sentiments length: ', len(subsampled_df))

# Creating Train and Test Dataset on Sentiments

In [None]:
#building the modelling dataset: the subsampled neutral tweets are concatenated
#with all negative and all positive tweets, then split into train and test sets.

combined_df = pd.concat([subsampled_df,df[df['sentiment'] == 'Negative'], df[df['sentiment'] == 'Positive']])
combined_df = combined_df[['text', 'sentiment']]
# A fixed random_state makes the 80/20 split (and every score derived from it)
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(combined_df['text'], combined_df['sentiment'], test_size=0.2,
                                                    random_state=42)


# Vectorization: Bag of words/Tf-idf

In [None]:
# Bag of words: the vocabulary is fitted on the training tweets only and then
# reused unchanged to transform the test tweets.
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

# Tf-idf: same fit-on-train / transform-test pattern.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Word2Vec 

In [None]:

# Create Word2Vec representation.

def _average_word_vectors(tokenized_tweets, keyed_vectors):
    """Represent every tweet by the mean of its in-vocabulary word vectors.

    Parameters
    ----------
    tokenized_tweets : iterable of list of str
        One token list per tweet.
    keyed_vectors : the ``wv`` attribute of a trained Word2Vec model
        Provides ``key_to_index``, ``vector_size`` and vector lookup.

    Returns
    -------
    numpy.ndarray with one row per tweet.

    A tweet with no known word (empty, or made only of words dropped by
    ``min_count``) is mapped to a zero vector. Looking up an empty word list,
    or a placeholder word that is not in the vocabulary, would raise instead.
    """
    tweet_vectors = []
    for tweet in tokenized_tweets:
        known_words = [word for word in tweet if word in keyed_vectors.key_to_index]
        if known_words:
            # Take the average of each vector
            tweet_vectors.append(np.mean(keyed_vectors[known_words], axis=0))
        else:
            tweet_vectors.append(np.zeros(keyed_vectors.vector_size, dtype=np.float32))
    return np.array(tweet_vectors)


print('Training Word2Vec model (after tokenization)...')
tokenized_X_train = [nltk.word_tokenize(tweet) for tweet in X_train]
tokenized_X_test = [nltk.word_tokenize(tweet) for tweet in X_test]
# Learn word vectors from the corpus, dimensionality is 100.
# Passing the sentences to the constructor already trains the model, so the
# extra model.train(...) call that used to follow trained it a second time.
model = Word2Vec(tokenized_X_train, vector_size=100, window=5, min_count=5, workers=4, epochs=5)

print('Transforming train tweets to w2v representation...')
X_train_w2v = _average_word_vectors(tokenized_X_train, model.wv)
# Sanity check
print('Processed this number of tweets: ', len(X_train_w2v))
print('Train corpus shape after word2vec conversion', X_train_w2v.shape)

# Also transform the test set for usage later on
X_test_w2v = _average_word_vectors(tokenized_X_test, model.wv)
# Sanity check
print('Processed this number of tweets: ', len(X_test_w2v))
print('Test corpus shape after word2vec conversion', X_test_w2v.shape)

# LDA topic modelling: lemmatization -> stemming -> bag of words (tokenization and stopword removal have already been performed)

In [None]:

print(tokenized_X_train[0])

# Classification with SVM, RandomForests and KNN

In [None]:

# Metrics requested from cross_validate: display name -> scorer string.
# format_results reads the outcomes back under the keys 'test_<display name>'.
scoring = {'Accuracy': 'accuracy', 'Precision': 'precision_macro', 'Recall': 'recall_macro',
           'F-Measure': 'f1_macro'}
# Shared n_jobs value passed to the estimators and to cross_validate below.
n_jobs = -1

def train_evaluate_classifier(corpus, sentiments, clf):
    """Evaluate one classifier on ``corpus`` with 10-fold cross-validation.

    Parameters
    ----------
    corpus : feature matrix
        Vectorized tweets, one row per tweet.
    sentiments : array-like
        Sentiment label of every row in ``corpus``.
    clf : str
        Which classifier to evaluate: ``'svm'``, ``'random_forest'`` or ``'knn'``.

    Returns
    -------
    The result of ``cross_validate`` for the chosen classifier, scored with the
    module-level ``scoring`` mapping.

    Raises
    ------
    ValueError
        If ``clf`` is not one of the supported names. Previously the function
        returned None in that case and the failure only surfaced later.

    Relies on the module-level ``scoring`` and ``n_jobs`` settings.
    """
    if clf == 'svm':
        # Dual = False helps speed up the process
        print('Training SVM classifier...')
        estimator = LinearSVC(dual=False)
    elif clf == 'random_forest':
        print('Training Random Forest Classifier...')
        estimator = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=n_jobs, verbose=10)
    elif clf == 'knn':
        print('Training KNN classifier')
        # No fit() here: cross_validate fits a fresh copy of the estimator on
        # every fold, so fitting on the full corpus first was wasted work.
        estimator = KNeighborsClassifier(n_jobs=n_jobs)
    else:
        raise ValueError(
            "Unknown classifier {!r}; expected 'svm', 'random_forest' or 'knn'".format(clf))

    # Train and evaluate with 10fold
    return cross_validate(estimator, corpus, sentiments, cv=10, scoring=scoring, n_jobs=n_jobs, verbose=10)

def format_results(score_list):
    """Summarize cross-validation outputs as per-metric means.

    Parameters
    ----------
    score_list : iterable of dict
        Each dict holds the per-fold values under 'test_Accuracy',
        'test_Precision', 'test_Recall' and 'test_F-Measure'.

    Returns
    -------
    list of dict
        One dict per input entry, mapping 'Accuracy', 'Precision', 'Recall'
        and 'F-Measure' to the fold mean rounded to four decimals.
    """
    metric_names = ('Accuracy', 'Precision', 'Recall', 'F-Measure')
    return [
        {
            metric: float("{0:.4f}".format(np.mean(fold_scores['test_' + metric])))
            for metric in metric_names
        }
        for fold_scores in score_list
    ]

def predict(corpus, clf):
    """Announce the prediction step and return ``clf.predict(corpus)``."""
    print('Predicting on test set...')
    return clf.predict(corpus)

classifiers = ['svm', 'random_forest', 'knn']
corpus_dict = {'bow': X_train_bow, 'tfidf': X_train_tfidf, 'w2v': X_train_w2v}
# Train, evaluate classifiers and format results properly
scores = []
combinations = list(itertools.product(corpus_dict.values(), classifiers))
for current_corpus, classifier in combinations:
    scores.append(train_evaluate_classifier(current_corpus, y_train, classifier))

formatted_scores = format_results(score_list=scores)

print('Results from 10fold cross-validation on the training set')
# Build the labels from the same product that produced `scores`. The scores are
# ordered corpus-first (bow-svm, bow-random_forest, bow-knn, tfidf-svm, ...),
# whereas the previous hand-written list was ordered classifier-first, so most
# results were printed under the wrong label.
classifier_labels = {'svm': 'SVM', 'random_forest': 'Random Forest', 'knn': 'KNN'}
result_combinations = ['{}-{}'.format(classifier_labels[classifier], corpus_name)
                       for corpus_name, classifier in itertools.product(corpus_dict.keys(), classifiers)]

for result_combination, current_result in zip(result_combinations, formatted_scores):
    print(result_combination, current_result)