In [34]:
import nltk
import pandas as pd
import re, string
import os
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer

#data loading
data = pd.ExcelFile('C:/Users/utsav/OneDrive/UIC/Fall_2023/CS_583/Project/training-Obama-Romney-tweets.xlsx')
obama = pd.read_excel(data, 'Obama')
#data cleaning
obama = obama[1:]
obama = obama.drop(['Unnamed: 0', 'date', 'time', 'Unnamed: 5'], axis=1)
obama = obama.rename(columns={'Unnamed: 4': 'class', 'Anootated tweet': 'tweet'})
obama = obama[obama['class'] != 2]
obama = obama.dropna()
obama['sentiment'] = obama['class'].apply(lambda x: 'positive' if x == 1 else 'neutral' if x==0 else 'negative')
obama = obama.drop(['class'], axis=1)

In [35]:
def clean(text):
    text = text.lower()
    text = re.sub(r'@[A-Za-z0-9]+', '', text)
    text = re.sub(r'#[A-Za-z0-9]+', '', text)
    text = re.sub(r'https?://[A-Za-z0-9./]+', '', text)
    text = re.sub(r'www.[^ ]+', '', text)
    text = re.sub(r'[^a-z]', ' ', text)
    text = re.sub(r' +', ' ', text)
    return text

def tokenize(text):
    stop_words = stopwords.words('english')
    text = clean(text)
    text = nltk.word_tokenize(text)
    text = [w for w in text if w not in stop_words]

    return text

In [36]:
obama['tweet_token'] = obama['tweet'].apply(lambda stext: tokenize(str(stext)))

#remove words with length less than 2
obama['tweet_string'] = obama['tweet_token'].apply(lambda x:' '.join([item for item in x if len(item)>2]))

all_words = ' '.join([text for text in obama['tweet_string']])
tokenized_obama = nltk.tokenize.word_tokenize(all_words)
fdist = FreqDist(tokenized_obama)
obama['tweet_string_fdist'] = obama['tweet_token'].apply(lambda x: ' '.join([item for item in x if fdist[item] >= 1 ]))
#fdist
#lemmatize
wordnet_lem = WordNetLemmatizer()
obama['tweet'] = obama['tweet_string_fdist'].apply(wordnet_lem.lemmatize)

obama = obama.drop(['tweet_token', 'tweet_string', 'tweet_string_fdist'], axis=1)
print(obama.shape)
obama.head(5)


(5718, 2)


Unnamed: 0,tweet,sentiment
1,kirkpatrick wore baseball cap embroidered bara...,neutral
3,obama debates cracker ass cracker tonight tuned,positive
5,youre missing point afraid understand bigger p...,neutral
7,raised democrat left party years ago lifetime ...,negative
8,obama camp afford lower expectations tonight d...,neutral


In [37]:
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix

## NLTK VADER

In [38]:
from nltk.sentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()
obama['polarity'] = obama['tweet'].apply(lambda x: analyzer.polarity_scores(x)['compound'])
obama['predicted'] = obama['polarity'].apply(lambda x: 'positive' if x > 0 else 'neutral' if x==0 else 'negative')

In [39]:
print(classification_report(obama['sentiment'],obama['predicted']))

              precision    recall  f1-score   support

    negative       0.58      0.49      0.53      2170
     neutral       0.42      0.36      0.39      1895
    positive       0.37      0.51      0.43      1653

    accuracy                           0.45      5718
   macro avg       0.46      0.45      0.45      5718
weighted avg       0.47      0.45      0.46      5718



In [40]:
df_X = obama['tweet']
df_Y = obama['sentiment']
X_train, X_test, y_train, y_test = train_test_split(df_X,df_Y , test_size=0.2)
X_train.head(5)

3448    according polls undecided voters barack obama ...
368                 president obama best choice americans
258     shocked president obama lied rev wright disapp...
3103                                            obama win
3279                   obama record promised failed hurts
Name: tweet, dtype: object

### TF-IDF

In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(use_idf=True, max_features=10000)
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_vectors_tfidf = tfidf_vectorizer.transform(X_test)

## Logistic Regression

### fitting the model

In [42]:
lr_model = LogisticRegression(solver='liblinear',C=10,penalty='l2',random_state=4) #4=57%
lr_model.fit(X_train_vectors_tfidf, y_train)

#### Predicting

In [43]:
lr_y_pred = lr_model.predict(X_test_vectors_tfidf)
print(classification_report(y_test,lr_y_pred))

              precision    recall  f1-score   support

    negative       0.60      0.63      0.61       441
     neutral       0.53      0.45      0.49       367
    positive       0.56      0.60      0.58       336

    accuracy                           0.57      1144
   macro avg       0.56      0.56      0.56      1144
weighted avg       0.56      0.57      0.56      1144



## Naive Bayes

### fitting the model

In [44]:
from sklearn.naive_bayes import MultinomialNB

nb_model = MultinomialNB()
nb_model.fit(X_train_vectors_tfidf, y_train)

### predicting

In [45]:
nb_y_pred = nb_model.predict(X_test_vectors_tfidf)
print(classification_report(y_test,nb_y_pred))

              precision    recall  f1-score   support

    negative       0.54      0.78      0.64       441
     neutral       0.55      0.34      0.42       367
    positive       0.61      0.51      0.56       336

    accuracy                           0.56      1144
   macro avg       0.57      0.54      0.54      1144
weighted avg       0.56      0.56      0.54      1144



## SVM

### Fitting the model

In [46]:
from sklearn import model_selection, svm

svm_model = svm.SVC(kernel='linear', random_state=4)
svm_model.fit(X_train_vectors_tfidf, y_train)

### Predicting

In [47]:
svm_y_pred = svm_model.predict(X_test_vectors_tfidf)
print(classification_report(y_test,svm_y_pred))

              precision    recall  f1-score   support

    negative       0.60      0.66      0.63       441
     neutral       0.53      0.49      0.51       367
    positive       0.60      0.58      0.59       336

    accuracy                           0.58      1144
   macro avg       0.58      0.57      0.57      1144
weighted avg       0.58      0.58      0.58      1144



## Word2Vec

from: https://medium.com/analytics-vidhya/nlp-tutorial-for-text-classification-in-python-8f19cd17b49e

In [48]:
pip install gensim

Note: you may need to restart the kernel to use updated packages.


In [52]:
import gensim
from gensim.models import Word2Vec

In [53]:
X_train_tok= [nltk.word_tokenize(i) for i in X_train]  
X_test_tok= [nltk.word_tokenize(i) for i in X_test]

In [54]:
#building Word2Vec model
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(next(iter(word2vec.values())))
    def fit(self, X, y):
        return self
    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

obama['clean_text_tok']=[nltk.word_tokenize(i) for i in obama['tweet']] 
model = Word2Vec(obama['clean_text_tok'],min_count=1) 
w2v = dict(zip(model.wv.index_to_key , model.wv.vectors))   
modelw = MeanEmbeddingVectorizer(w2v)

X_train_vectors_w2v = modelw.transform(X_train_tok)
X_val_vectors_w2v = modelw.transform(X_test_tok)

### Logistic Regression

In [55]:
lr_model2 = LogisticRegression(solver='liblinear',C=10,penalty='l2',random_state=4) #4=57%
lr_model2.fit(X_train_vectors_w2v, y_train)

In [56]:
lr_y_pred2 = lr_model2.predict(X_val_vectors_w2v)
print(classification_report(y_test,lr_y_pred2))

              precision    recall  f1-score   support

    negative       0.45      0.85      0.59       441
     neutral       0.59      0.05      0.10       367
    positive       0.54      0.45      0.49       336

    accuracy                           0.48      1144
   macro avg       0.53      0.45      0.39      1144
weighted avg       0.52      0.48      0.40      1144

