# Obama
## Data Cleaning 

### Loading the data and renaming the dataframe columns

In [485]:
import nltk
import pandas as pd
import re, string
import os
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from nltk.probability import FreqDist

#data loading
data = pd.ExcelFile('C:/Users/utsav/OneDrive/UIC/Fall_2023/CS_583/Project/training-Obama-Romney-tweets.xlsx')
obama = pd.read_excel(data, 'Obama')
#data cleaning
obama = obama[1:]
obama = obama.drop(['Unnamed: 0', 'date', 'time', 'Unnamed: 5'], axis=1)
obama = obama.rename(columns={'Unnamed: 4': 'class', 'Anootated tweet': 'tweet'})
performance = pd.DataFrame(columns=['Model','Accuracy', 'Precision', 'Recall', 'F1 Score'])

### Cleaning the data

In [486]:
print(obama['class'].value_counts())

-1            1922
0             1896
1             1653
2             1474
0               82
2               70
-1              46
1               26
irrevelant      23
irrelevant       1
Name: class, dtype: int64


In [487]:
#obama = obama[obama['class'].isin(['-1', '0', '1'])]
obama = obama.dropna()
#obama = obama.drop(['class'], axis=1)

In [488]:
obama_mixed = obama[obama['class'].isin(['2',2])]

In [489]:
obama = obama[obama['class'].isin(['-1', '0', '1','2',2,-1,0,1])]

In [490]:
obama['class']=obama['class'].astype(int)
obama['class']=obama['class'].apply(lambda x: 0 if x==2 else x)

In [491]:
print(obama['class'].value_counts())

 0    3520
-1    1968
 1    1679
Name: class, dtype: int64


In [492]:
obama['sentiment'] = obama['class'].apply(lambda x: 'positive' if x == 1 else 'neutral' if x==0 else 'negative')
#obama['sentiment'] = obama['class'].map({1: 'positive', 0: 'neutral', -1: 'negative'})

#pip install spacy

In [493]:
#python -m spacy download en_core_web_trf

In [494]:
def clean(text):
    text = text.lower()
    text = re.sub(r'@[A-Za-z0-9]+', '', text)
    text = re.sub(r'#[A-Za-z0-9]+', '', text)
    text = re.sub(r'https?://[A-Za-z0-9./]+', '', text)
    text = re.sub(r'www.[^ ]+', '', text)
    text = re.sub(r'[^a-z]', ' ', text)
    text = re.sub(r' +', ' ', text)
    return text

regexp = RegexpTokenizer('\w+')
# import spacy
nltk.download('stopwords')
def tokenize(text):
    
    stop_words = stopwords.words('english')
    text = clean(text)
    #text = nltk.word_tokenize(text)
    text = regexp.tokenize(text)
    text = [w for w in text if w not in stop_words]
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\utsav\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [495]:
obama['tweet_token'] = obama['tweet'].apply(lambda stext: tokenize(str(stext)))

#remove words with length less than 2
obama['tweet_string'] = obama['tweet_token'].apply(lambda x:' '.join([item for item in x if len(item)>2]))

all_words = ' '.join([text for text in obama['tweet_string']])
tokenized_obama = nltk.tokenize.word_tokenize(all_words)
fdist = FreqDist(tokenized_obama)
obama['tweet_string_fdist'] = obama['tweet_token'].apply(lambda x: ' '.join([item for item in x if fdist[item] > 1 ]))
#fdist
#lemmatize
# nltk.download('wordnet') 
# wordnet_lem = WordNetLemmatizer()
# obama['tweet'] = obama['tweet_string_fdist'].apply(wordnet_lem.lemmatize)

In [496]:
from nltk.stem import WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet
lemmatizer = WordNetLemmatizer()
def pos_tagger(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None

def lemmatiser(text):
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(text))  
    wordnet_tagged = map(lambda x: (x[0], pos_tagger(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            lemmatized_sentence.append(word)
        else:        
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\utsav\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [497]:

obama['tweet'] = obama['tweet_string_fdist'].apply(lambda x: lemmatiser(x))
obama.head(5)

Unnamed: 0,tweet,class,sentiment,tweet_token,tweet_string,tweet_string_fdist
1,wore cap barack obama signature look jason jou...,0,neutral,"[kirkpatrick, wore, baseball, cap, embroidered...",kirkpatrick wore baseball cap embroidered bara...,wore cap barack obama signature look jason jou...
2,question romney obama child contest mitt punch...,0,neutral,"[question, e, romney, e, e, obama, e, child, p...",question romney obama child punching contest m...,question romney obama child contest mitt punch...
3,obama debate cracker as cracker tonight,1,positive,"[e, obama, e, debates, cracker, ass, cracker, ...",obama debates cracker ass cracker tonight tuned,obama debates cracker ass cracker tonight
4,slate blame obama four death libya blame bush ...,0,neutral,"[rt, slate, blame, e, obama, e, four, deaths, ...",slate blame obama four deaths libya blame bush...,slate blame obama four deaths libya blame bush...
5,miss point afraid understand big picture dont ...,0,neutral,"[youre, missing, point, im, afraid, understand...",youre missing point afraid understand bigger p...,missing point afraid understand bigger picture...


In [498]:
obama = obama.drop(['tweet_token', 'tweet_string', 'tweet_string_fdist'], axis=1)
#obama = obama[obama['tweet'].apply(lambda x: len(x.split())>1)]
obama.dropna(inplace=True)
print(obama.shape)
obama.head(5)

(7167, 3)


Unnamed: 0,tweet,class,sentiment
1,wore cap barack obama signature look jason jou...,0,neutral
2,question romney obama child contest mitt punch...,0,neutral
3,obama debate cracker as cracker tonight,1,positive
4,slate blame obama four death libya blame bush ...,0,neutral
5,miss point afraid understand big picture dont ...,0,neutral


In [499]:
print(obama['sentiment'].value_counts())

neutral     3520
negative    1968
positive    1679
Name: sentiment, dtype: int64


In [500]:
obama_mixed['class']=obama_mixed['class'].astype(int)
obama_mixed['sentiment'] = obama_mixed['class'].apply(lambda x: 'mixed' if x == 2 else 'mixed')
obama_mixed['tweet_token'] = obama_mixed['tweet'].apply(lambda stext: tokenize(str(stext)))

#remove words with length less than 2
obama_mixed['tweet_string'] = obama_mixed['tweet_token'].apply(lambda x:' '.join([item for item in x if len(item)>2]))

all_words = ' '.join([text for text in obama_mixed['tweet_string']])
tokenized_obama_mixed = nltk.tokenize.word_tokenize(all_words)
fdist = FreqDist(tokenized_obama_mixed)
obama_mixed['tweet_string_fdist'] = obama_mixed['tweet_token'].apply(lambda x: ' '.join([item for item in x if fdist[item] > 1 ]))
#fdist
#lemmatize
# nltk.download('wordnet') 
# wordnet_lem = WordNetLemmatizer()
# obama_mixed['tweet'] = obama_mixed['tweet_string_fdist'].apply(wordnet_lem.lemmatize)

obama_mixed['tweet'] = obama_mixed['tweet_string_fdist'].apply(lambda x: lemmatiser(x))
obama_mixed.head(5)
obama_mixed = obama_mixed.drop(['tweet_token', 'tweet_string', 'tweet_string_fdist'], axis=1)
#obama_mixed = obama_mixed[obama_mixed['tweet'].apply(lambda x: len(x.split())>1)]
obama_mixed.dropna(inplace=True)
print(obama_mixed.shape)
obama_mixed.head(5)

(1543, 3)


Unnamed: 0,tweet,class,sentiment
2,question romney obama child mitt punch five ob...,2,mixed
4,slate blame obama four death libya blame bush ...,2,mixed
6,mitt romney make money barack obama make money...,2,mixed
9,tonight debate game feel pres obama call romne...,2,mixed
12,guy rather obama critique romney tax plan,2,mixed


In [501]:
import nltk
import pandas as pd
import re, string
import os
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from nltk.probability import FreqDist
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix, precision_score, recall_score

## Performing train_test_split

In [502]:
#obama = pd.read_csv('C:/Users/utsav/OneDrive/UIC/Fall_2023/CS_583/Project/Cleaned/obama_cleaned.csv')

In [503]:
df_X = obama['tweet']
df_Y = obama['sentiment']
X_train, X_test, y_train, y_test = train_test_split(df_X,df_Y,test_size=0.2,random_state = 1551)
#X_train.head(5)

### TF-IDF

In [504]:

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(1,2))
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_vectors_tfidf = tfidf_vectorizer.transform(X_test)

In [505]:
X_mixed = tfidf_vectorizer.transform(obama_mixed['tweet'])

In [506]:
from sklearn.model_selection import GridSearchCV

## NLTK VADER

In [507]:
from nltk.sentiment import SentimentIntensityAnalyzer
obama_vader = obama.copy(deep=True)
analyzer = SentimentIntensityAnalyzer()
obama_vader['polarity'] = obama_vader['tweet'].apply(lambda x: analyzer.polarity_scores(x)['compound'])
obama_vader['predicted'] = obama_vader['polarity'].apply(lambda x: 'positive' if x > 0 else 'neutral' if x==0 else 'negative')

In [508]:
print(classification_report(obama_vader['sentiment'],obama_vader['predicted']))
#performance = pd.DataFrame(columns=['Model','Accuracy', 'Precision', 'Recall', 'F1 Score'])
new_data = {'Model': 'Vader',
            'Accuracy': accuracy_score(obama_vader['sentiment'],obama_vader['predicted']),
            'Precision': precision_score(obama_vader['sentiment'],obama_vader['predicted'], average='weighted'),
            'Recall': recall_score(obama_vader['sentiment'],obama_vader['predicted'], average='weighted'),
            'F1 Score': f1_score(obama_vader['sentiment'],obama_vader['predicted'], average='weighted')}
performance = performance.append(new_data, ignore_index=True)
print(performance.head(5))

              precision    recall  f1-score   support

    negative       0.43      0.47      0.45      1968
     neutral       0.57      0.34      0.43      3520
    positive       0.30      0.52      0.38      1679

    accuracy                           0.42      7167
   macro avg       0.43      0.45      0.42      7167
weighted avg       0.47      0.42      0.42      7167

   Model  Accuracy  Precision    Recall  F1 Score
0  Vader  0.420678   0.467917  0.420678  0.424098


  performance = performance.append(new_data, ignore_index=True)


## Logistic Regression

### fitting the model

In [509]:
from sklearn.linear_model import LogisticRegression

In [510]:
lr_model = LogisticRegression(solver='saga',C=5,penalty='l2',random_state=44) #4=57%
lr_model.fit(X_train_vectors_tfidf, y_train)

#### Predicting

In [511]:
lr_y_pred = lr_model.predict(X_test_vectors_tfidf)
print(classification_report(y_test,lr_y_pred))
new_data = {'Model': 'Logistic Regression',
            'Accuracy': accuracy_score(y_test,lr_y_pred),
            'Precision': precision_score(y_test,lr_y_pred, average='weighted'),
            'Recall': recall_score(y_test,lr_y_pred, average='weighted'),
            'F1 Score': f1_score(y_test,lr_y_pred, average='weighted')}
performance = performance.append(new_data, ignore_index=True)
print(performance.head(5))

              precision    recall  f1-score   support

    negative       0.59      0.58      0.59       385
     neutral       0.66      0.76      0.71       695
    positive       0.63      0.47      0.54       354

    accuracy                           0.64      1434
   macro avg       0.63      0.60      0.61      1434
weighted avg       0.64      0.64      0.63      1434

                 Model  Accuracy  Precision    Recall  F1 Score
0                Vader  0.420678   0.467917  0.420678  0.424098
1  Logistic Regression  0.638773   0.636332  0.638773  0.633323


  performance = performance.append(new_data, ignore_index=True)


## Naive Bayes

### fitting the model

In [512]:
from sklearn.naive_bayes import MultinomialNB, ComplementNB

nb_model = MultinomialNB()
nb_model.fit(X_train_vectors_tfidf, y_train)

### predicting

In [513]:
nb_y_pred = nb_model.predict(X_test_vectors_tfidf)
print(classification_report(y_test,nb_y_pred))
new_data = {'Model': 'Naive Bayes',
            'Accuracy': accuracy_score(y_test,nb_y_pred),
            'Precision': precision_score(y_test,nb_y_pred, average='weighted'),
            'Recall': recall_score(y_test,nb_y_pred, average='weighted'),
            'F1 Score': f1_score(y_test,nb_y_pred, average='weighted')}
performance = performance.append(new_data, ignore_index=True)

              precision    recall  f1-score   support

    negative       0.70      0.16      0.26       385
     neutral       0.52      0.96      0.67       695
    positive       0.84      0.12      0.21       354

    accuracy                           0.54      1434
   macro avg       0.69      0.41      0.38      1434
weighted avg       0.65      0.54      0.45      1434



  performance = performance.append(new_data, ignore_index=True)


In [514]:
#for i in range(1000,9999):
    

In [515]:
performance.head(5)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Vader,0.420678,0.467917,0.420678,0.424098
1,Logistic Regression,0.638773,0.636332,0.638773,0.633323
2,Naive Bayes,0.539052,0.645972,0.539052,0.446748


## SVM

### Fitting the model

In [516]:
from sklearn import model_selection, svm

svm_model = svm.SVC(kernel='linear', random_state=4)
svm_model.fit(X_train_vectors_tfidf, y_train)

### Predicting

In [517]:
svm_y_pred = svm_model.predict(X_test_vectors_tfidf)
print(classification_report(y_test,svm_y_pred))
new_data = {'Model': 'SVM',
            'Accuracy': accuracy_score(y_test,svm_y_pred),
            'Precision': precision_score(y_test,svm_y_pred, average='weighted'),
            'Recall': recall_score(y_test,svm_y_pred, average='weighted'),
            'F1 Score': f1_score(y_test,svm_y_pred, average='weighted')}
performance = performance.append(new_data, ignore_index=True)

              precision    recall  f1-score   support

    negative       0.60      0.60      0.60       385
     neutral       0.67      0.76      0.71       695
    positive       0.63      0.47      0.54       354

    accuracy                           0.64      1434
   macro avg       0.64      0.61      0.62      1434
weighted avg       0.64      0.64      0.64      1434



  performance = performance.append(new_data, ignore_index=True)


In [518]:
print(X_test_vectors_tfidf)

  (0, 15147)	0.0696471516955555
  (0, 16077)	0.6059808658688938
  (0, 19344)	0.5122339799252629
  (0, 23872)	0.35322438266634676
  (0, 23901)	0.4906988381496186
  (1, 2160)	0.30183119241617407
  (1, 5275)	0.12570973870394758
  (1, 5500)	0.3727417167244743
  (1, 12414)	0.2629677625697816
  (1, 12438)	0.16558683028868248
  (1, 13480)	0.25137442887384664
  (1, 15147)	0.04284029471912051
  (1, 15424)	0.21440017978811832
  (1, 19121)	0.20243378622854646
  (1, 19152)	0.28893002028231923
  (1, 19583)	0.23884969275686901
  (1, 21976)	0.3727417167244743
  (1, 23029)	0.3057957047406717
  (1, 23922)	0.18118553672102053
  (1, 23980)	0.2231082901435064
  (1, 25813)	0.22368203573607245
  (2, 849)	0.226846685206795
  (2, 3036)	0.20706274445730258
  (2, 3057)	0.3599853456977993
  (2, 3989)	0.2138549390025235
  :	:
  (1432, 14546)	0.1996691115045924
  (1432, 15147)	0.045512536497552575
  (1432, 15214)	0.3653616317561438
  (1432, 19714)	0.30142153513868825
  (1432, 20798)	0.16506372085838966
  (1432, 20

In [519]:
print(X_mixed.shape)
print(X_test_vectors_tfidf.shape)
print(X_train_vectors_tfidf.shape)

(1543, 27031)
(1434, 27031)
(5733, 27031)


In [520]:
performance.head(5)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Vader,0.420678,0.467917,0.420678,0.424098
1,Logistic Regression,0.638773,0.636332,0.638773,0.633323
2,Naive Bayes,0.539052,0.645972,0.539052,0.446748
3,SVM,0.644351,0.642072,0.644351,0.63876


In [521]:
mixed_pred = svm_model.predict(X_mixed)
print(mixed_pred)


['neutral' 'neutral' 'neutral' ... 'neutral' 'neutral' 'neutral']


## KNN

In [522]:
# #your code here
# from sklearn.neighbors import KNeighborsClassifier
# clf = KNeighborsClassifier(n_neighbors = 15)
# clf.fit(X_train_vectors_tfidf, y_train)
# y_pred_sklearn = clf.predict(X_test_vectors_tfidf)
# print(classification_report(y_test,y_pred_sklearn))
# new_data = {'Model': 'KNN',
#             'Accuracy': accuracy_score(y_test,y_pred_sklearn),
#             'Precision': precision_score(y_test,y_pred_sklearn, average='weighted'),
#             'Recall': recall_score(y_test,y_pred_sklearn, average='weighted'),
#             'F1 Score': f1_score(y_test,y_pred_sklearn, average='weighted')}
# performance = performance.append(new_data, ignore_index=True)

In [523]:
from sklearn.model_selection import GridSearchCV#create new a knn model
from sklearn.neighbors import KNeighborsClassifier
knn2 = KNeighborsClassifier()#create a dictionary of all values we want to test for n_neighbors
param_grid = {'n_neighbors': np.arange(1, 25)}#use gridsearch to test all values for n_neighbors
knn_gscv = GridSearchCV(knn2, param_grid, cv=5)#fit model to data
clf = knn_gscv.fit(X_train_vectors_tfidf, y_train)
y_pred_sklearn = clf.predict(X_test_vectors_tfidf)
print(classification_report(y_test,y_pred_sklearn))
new_data = {'Model': 'KNN',
            'Accuracy': accuracy_score(y_test,y_pred_sklearn),
            'Precision': precision_score(y_test,y_pred_sklearn, average='weighted'),
            'Recall': recall_score(y_test,y_pred_sklearn, average='weighted'),
            'F1 Score': f1_score(y_test,y_pred_sklearn, average='weighted')}
performance = performance.append(new_data, ignore_index=True)

              precision    recall  f1-score   support

    negative       0.61      0.30      0.41       385
     neutral       0.59      0.72      0.65       695
    positive       0.47      0.53      0.49       354

    accuracy                           0.56      1434
   macro avg       0.56      0.52      0.52      1434
weighted avg       0.57      0.56      0.55      1434



  performance = performance.append(new_data, ignore_index=True)


In [524]:
performance.head(10)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Vader,0.420678,0.467917,0.420678,0.424098
1,Logistic Regression,0.638773,0.636332,0.638773,0.633323
2,Naive Bayes,0.539052,0.645972,0.539052,0.446748
3,SVM,0.644351,0.642072,0.644351,0.63876
4,KNN,0.560669,0.566667,0.560669,0.546546


## XGBoost

### Creating a new train_test_split

In [525]:
obama_xgb = obama.copy(deep=True)
obama_xgb.drop(['sentiment'], axis=1)
obama_xgb['class'] = obama_xgb['class'].map({1: 1, 0: 0 ,-1: 2})
df_X_xgb = obama_xgb['tweet']
df_Y_xgb = obama_xgb['class']
X_train_xgb, X_test_xgb, y_train_xgb, y_test_xgb = train_test_split(df_X_xgb,df_Y_xgb,test_size=0.2)

### TF-IDF

In [526]:
tfidf_vectorizer = TfidfVectorizer(use_idf=True, max_features=10000)
X_train_vectors_tfidf_xgb = tfidf_vectorizer.fit_transform(X_train_xgb)
X_test_vectors_tfidf_xgb = tfidf_vectorizer.transform(X_test_xgb)

### Fitting the model

In [527]:
obama_xgb.head()

Unnamed: 0,tweet,class,sentiment
1,wore cap barack obama signature look jason jou...,0,neutral
2,question romney obama child contest mitt punch...,0,neutral
3,obama debate cracker as cracker tonight,1,positive
4,slate blame obama four death libya blame bush ...,0,neutral
5,miss point afraid understand big picture dont ...,0,neutral


In [528]:
from xgboost import XGBClassifier

In [529]:
xgb = XGBClassifier(n_estimators=1000, max_depth=15, eta=0.1, subsample=0.7, colsample_bytree=1)

xgb.fit(X_train_vectors_tfidf_xgb,y_train_xgb)
xgb.score(X_test_vectors_tfidf_xgb,y_test_xgb)

0.6387726638772664

In [530]:
y_pred = xgb.predict(X_test_vectors_tfidf_xgb)
print(y_pred)

[2 1 0 ... 2 0 0]


## Word2Vec

from: https://medium.com/analytics-vidhya/nlp-tutorial-for-text-classification-in-python-8f19cd17b49e

In [531]:
pip install gensim

Note: you may need to restart the kernel to use updated packages.


In [532]:
import gensim
from gensim.models import Word2Vec

In [533]:
X_train_tok= [nltk.word_tokenize(i) for i in X_train]  
X_test_tok= [nltk.word_tokenize(i) for i in X_test]

In [534]:
#building Word2Vec model
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(next(iter(word2vec.values())))
    def fit(self, X, y):
        return self
    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

obama['clean_text_tok']=[nltk.word_tokenize(i) for i in obama['tweet']] 
model = Word2Vec(obama['clean_text_tok'],min_count=1) 
w2v = dict(zip(model.wv.index_to_key , model.wv.vectors))   
modelw = MeanEmbeddingVectorizer(w2v)

X_train_vectors_w2v = modelw.transform(X_train_tok)
X_val_vectors_w2v = modelw.transform(X_test_tok)

### Logistic Regression

In [535]:
lr_model2 = LogisticRegression(solver='liblinear',C=10,penalty='l2',random_state=4) #4=57%
lr_model2.fit(X_train_vectors_w2v, y_train)

In [536]:
lr_y_pred2 = lr_model2.predict(X_val_vectors_w2v)
print(classification_report(y_test,lr_y_pred2))

              precision    recall  f1-score   support

    negative       0.68      0.09      0.16       385
     neutral       0.51      0.96      0.66       695
    positive       0.63      0.11      0.19       354

    accuracy                           0.52      1434
   macro avg       0.61      0.39      0.34      1434
weighted avg       0.58      0.52      0.41      1434

