<a href="https://colab.research.google.com/github/tarjerw/TDT4173-project-group6/blob/main/ML_project_(BOW).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Political party analysis program that parses the tweets fetched from Twitter using Python

In [None]:
###### IMPORT THE LIBRARIES  #########
# Standard library
import re

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import validation_curve

## Retrieve data

In [None]:
# Load the tweet datasets for the two election years.
# Later cells read the `Tweet` (text) and `Party` (label) columns of these frames.
tweets2016_df = pd.read_csv('TweetDatabase_2016.csv')
tweets2020_df = pd.read_csv('TweetDatabase_2020.csv')

# Drop exact duplicate rows within each year. drop_duplicates() returns a new
# frame, so the raw frames loaded above stay untouched.
df_2016 = tweets2016_df.drop_duplicates()
df_2020 = tweets2020_df.drop_duplicates()

# Combined 2016+2020 dataset.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported replacement and, like append, keeps each frame's
# original index values.
df_2016_2020 = pd.concat([df_2016, df_2020])

# Shuffle every dataset. A fixed random_state makes the row order (and with it
# everything computed downstream) reproducible after a kernel restart.
SHUFFLE_SEED = 42
df_2016 = df_2016.sample(frac=1, random_state=SHUFFLE_SEED)
df_2020 = df_2020.sample(frac=1, random_state=SHUFFLE_SEED)
df_2016_2020 = df_2016_2020.sample(frac=1, random_state=SHUFFLE_SEED)

#### Tweet count

In [None]:
# Total number of tweets in each dataset (non-null entries of the Tweet column).
size_report = (
    (df_2016, "tweets in 2016"),
    (df_2020, "tweets in 2020"),
    (df_2016_2020, "tweets in 2016+2020 \n"),
)
for frame, caption in size_report:
    print(frame["Tweet"].count(), caption)


# Number of Democrat and Republican tweets in each dataset.
# The third element holds the extra print arguments: every section except the
# last one is followed by a blank line.
party_report = (
    ("Number of tweets year 2016:", df_2016, ('\n',)),
    ("Number of tweets year 2020:", df_2020, ('\n',)),
    ("Number of tweets year 2016+2020:", df_2016_2020, ()),
)
for heading, frame, trailer in party_report:
    print(heading)
    print(frame["Party"].value_counts(), *trailer)

6932 tweets in 2016
5732 tweets in 2020
12664 tweets in 2016+2020 

Number of tweets year 2016:
Democrat      3739
Republican    3193
Name: Party, dtype: int64 

Number of tweets year 2020:
Republican    3029
Democrat      2703
Name: Party, dtype: int64 

Number of tweets year 2016+2020:
Democrat      6442
Republican    6222
Name: Party, dtype: int64


## Sentiment Analysis using Bag-of-Words & N-Gram

### Splitting data

In [None]:
# Split each dataset into 80% training data and 20% test data.
# Without a seed train_test_split draws a different split on every run, so the
# accuracies and confusion matrices below could never be reproduced; a fixed
# random_state pins the split.
SPLIT_SEED = 42
X_train16, X_test16, y_train16, y_test16 = train_test_split(
    df_2016.Tweet, df_2016.Party, test_size=0.2, random_state=SPLIT_SEED)
X_train20, X_test20, y_train20, y_test20 = train_test_split(
    df_2020.Tweet, df_2020.Party, test_size=0.2, random_state=SPLIT_SEED)
X_train1620, X_test1620, y_train1620, y_test1620 = train_test_split(
    df_2016_2020.Tweet, df_2016_2020.Party, test_size=0.2, random_state=SPLIT_SEED)


# Report the Democrat/Republican counts of every training and test split.
split_report = [
    ("2016", y_train16, y_test16),
    ("2020", y_train20, y_test20),
    ("2016+2020", y_train1620, y_test1620),
]
for position, (year, train_labels, test_labels) in enumerate(split_report):
    if position > 0:
        # Separator between two consecutive year sections.
        print("-------------------------------------------------------\n")
    print("Year " + year + ": \n"+"Democratic republican ratio in Training data:\n"+str(train_labels.value_counts())+"\n")
    test_summary = "Democratic republican ratio in Test data:\n"+str(test_labels.value_counts())
    if position < len(split_report) - 1:
        print(test_summary,'\n')
    else:
        # The final section is printed without the trailing blank line.
        print(test_summary)


Year 2016: 
Democratic republican ratio in Training data:
Democrat      2997
Republican    2548
Name: Party, dtype: int64

Democratic republican ratio in Test data:
Democrat      742
Republican    645
Name: Party, dtype: int64 

-------------------------------------------------------

Year 2020: 
Democratic republican ratio in Training data:
Republican    2402
Democrat      2183
Name: Party, dtype: int64

Democratic republican ratio in Test data:
Republican    627
Democrat      520
Name: Party, dtype: int64 

-------------------------------------------------------

Year 2016+2020: 
Democratic republican ratio in Training data:
Democrat      5154
Republican    4977
Name: Party, dtype: int64

Democratic republican ratio in Test data:
Democrat      1288
Republican    1245
Name: Party, dtype: int64


#### Creating Bag-of-Words Model

In [None]:
# Bag-of-Words features: one independent CountVectorizer per dataset, all with
# the same preprocessing options.
#   lowercase=True       -> tokens are lower-cased before counting
#   max_df=0.35          -> ignore terms that appear in more than 35% of the tweets
#   min_df=2             -> ignore terms that appear in fewer than 2 tweets
#   stop_words='english' -> ignore scikit-learn's built-in English stop words
bow_options = dict(lowercase=True, max_df=0.35, min_df=2, stop_words='english')

BagOfWords_vectorizer16 = CountVectorizer(**bow_options)
BagOfWords_vectorizer20 = CountVectorizer(**bow_options)
BagOfWords_vectorizer1620 = CountVectorizer(**bow_options)

# fit_transform: learn the vocabulary from the training tweets and return their
# document-term matrix.
X_train_BagOfWords16 = BagOfWords_vectorizer16.fit_transform(X_train16)
X_train_BagOfWords20 = BagOfWords_vectorizer20.fit_transform(X_train20)
X_train_BagOfWords1620 = BagOfWords_vectorizer1620.fit_transform(X_train1620)

# transform: encode the test tweets with the vocabulary learned from the
# training tweets, so no test vocabulary leaks into the features.
X_test_BagOfWords16 = BagOfWords_vectorizer16.transform(X_test16)
X_test_BagOfWords20 = BagOfWords_vectorizer20.transform(X_test20)
X_test_BagOfWords1620 = BagOfWords_vectorizer1620.transform(X_test1620)


#### Creating N-Gram Model

In [None]:
# N-Gram features: count every word sequence of length 1 to 4, with one
# independent CountVectorizer per dataset.
#   ngram_range=(1,4) -> unigrams up to 4-grams
#   max_df=0.35       -> ignore n-grams that appear in more than 35% of the tweets
#   min_df=2          -> ignore n-grams that appear in fewer than 2 tweets
# Unlike the Bag-of-Words vectorizers, no stop-word list is passed here.
# Idea for later: try TfidfVectorizer() instead of CountVectorizer().
ngram_options = dict(ngram_range=(1,4), max_df=0.35, min_df=2)

N_Gram_vectorizer16 = CountVectorizer(**ngram_options)
N_Gram_vectorizer20 = CountVectorizer(**ngram_options)
N_Gram_vectorizer1620 = CountVectorizer(**ngram_options)

# Learn each vocabulary from the training tweets only ...
X_train_N_Gram16 = N_Gram_vectorizer16.fit_transform(X_train16)
X_train_N_Gram20 = N_Gram_vectorizer20.fit_transform(X_train20)
X_train_N_Gram1620 = N_Gram_vectorizer1620.fit_transform(X_train1620)

# ... and reuse it to encode the matching test tweets.
X_test_N_Gram16 = N_Gram_vectorizer16.transform(X_test16)
X_test_N_Gram20 = N_Gram_vectorizer20.transform(X_test20)
X_test_N_Gram1620 = N_Gram_vectorizer1620.transform(X_test1620)

### Performing cross validation

A key challenge in machine learning is that we can’t know how well our model will perform on new data until we actually test it.

To address this, we can split our initial dataset into separate training and test subsets.

There are different types of Cross Validation Techniques but the overall concept remains the same:

*   To partition the data into a number of subsets (given as argument for cv in cross_val_score below)
*   Hold out a set at a time and train the model on remaining set
*   Test model on hold out set
*   Repeat the process for each subset of the dataset




A cross validation is done in the code segment below


In [None]:
import TableIt


def _cv_scores(features, labels):
    """Return the 5-fold cross-validation scores of a default LogisticRegression.

    NOTE: `features` was produced by a vectorizer fitted on the whole training
    split, so every validation fold contributed to the vocabulary it is
    encoded with.
    """
    return cross_val_score(LogisticRegression(), features, labels, cv=5)


# Cross-validation scores on the training splits, per feature type and dataset.
scores_BagOfWords16 = _cv_scores(X_train_BagOfWords16, y_train16)
scores_BagOfWords20 = _cv_scores(X_train_BagOfWords20, y_train20)
scores_BagOfWords1620 = _cv_scores(X_train_BagOfWords1620, y_train1620)

scores_N_Gram16 = _cv_scores(X_train_N_Gram16, y_train16)
scores_N_Gram20 = _cv_scores(X_train_N_Gram20, y_train20)
scores_N_Gram1620 = _cv_scores(X_train_N_Gram1620, y_train1620)


# Table of the per-fold scores and their mean for both methods and all datasets.
# The first row holds the column names (useFieldNames=True).
myList = [["Year      | Method", "Cross Validation scores", "Mean accuracy"]]
cv_rows = (
    ("2016      | Bag-of-Words", scores_BagOfWords16),
    ("          | N-Gram", scores_N_Gram16),
    ("2020      | Bag-of-Words", scores_BagOfWords20),
    ("          | N-Gram", scores_N_Gram20),
    ("2016+2020 | Bag-of-Words", scores_BagOfWords1620),
    ("          | N-Gram", scores_N_Gram1620),
)
for row_label, fold_scores in cv_rows:
    myList.append([row_label, fold_scores, sum(fold_scores)/len(fold_scores)])

TableIt.printTable(myList, useFieldNames=True)

+---------------------------------------------------------------------------------------------------------------------+
| Year      | Method                                       | Cross Validation scores                                  |
+----------------------------------------------------------+----------------------------------------------------------+
| 2016      | Bag-of-Words                                 | [0.9251578  0.93327322 0.91974752 0.93146979 0.93327322] |
|           | N-Gram                                       | [0.9386835  0.93056808 0.93597836 0.94138864 0.93327322] |
| 2020      | Bag-of-Words                                 | [0.95528899 0.94547437 0.94220284 0.92693566 0.94438386] |
|           | N-Gram                                       | [0.94874591 0.94656489 0.9476554  0.94220284 0.9389313 ] |
| 2016+2020 | Bag-of-Words                                 | [0.92846571 0.92941757 0.92793682 0.92102665 0.9254689 ] |
|           | N-Gram                    

### Data Prediction

How well does a model fitted on the training data predict the test data?



Bag of Words

In [None]:
# Bag of Words: sweep the inverse regularisation strength C of a logistic
# regression and record the training and test accuracy for every dataset.
acc_table_train16, acc_table_test16 = [], []
acc_table_train20, acc_table_test20 = [], []
acc_table_train1620, acc_table_test1620 = [], []
param_range= [0.001,0.01,0.05,0.1,0.4,0.6,0.8,1.0,1.2,1.4,1.6,1.8,2.0,2.5,3,3.5,4,5,6,7,8,9,10]

# One entry per subplot:
# (title, train features, train labels, test features, test labels,
#  train accuracies, test accuracies)
bow_panels = [
    ('In 2016', X_train_BagOfWords16, y_train16,
     X_test_BagOfWords16, y_test16, acc_table_train16, acc_table_test16),
    ('In 2020', X_train_BagOfWords20, y_train20,
     X_test_BagOfWords20, y_test20, acc_table_train20, acc_table_test20),
    ('2016 + 2020', X_train_BagOfWords1620, y_train1620,
     X_test_BagOfWords1620, y_test1620, acc_table_train1620, acc_table_test1620),
]

for c_value in param_range:
    fitted_models = []
    for panel_title, X_tr, y_tr, X_te, y_te, train_acc, test_acc in bow_panels:
        # Fit on the training split, then score on both splits.
        model = LogisticRegression(C=c_value)
        model.fit(X_tr, y_tr)
        train_acc.append(model.score(X_tr, y_tr))
        test_acc.append(model.score(X_te, y_te))
        fitted_models.append(model)
    # Later cells use these names; once the loop ends they hold the models
    # fitted with the LAST value of param_range (C=10), not C=1.
    logit_BoW_16, logit_BoW_20, logit_BoW_1620 = fitted_models

# One panel per dataset, sharing the accuracy axis.
fig, axs = plt.subplots(1, 3, sharey='row', figsize=(22.5,8))
for ax, (panel_title, X_tr, y_tr, X_te, y_te, train_acc, test_acc) in zip(axs, bow_panels):
    ax.plot(param_range, train_acc, label="Train score", color="g")
    ax.plot(param_range, test_acc, label="Test score", color="r")
    ax.set_title(panel_title)
    ax.set(xlabel='C value')
    ax.grid()
    # The curves carry labels, but they are only visible once a legend is drawn.
    ax.legend(loc="best")
axs[0].set(ylabel='Accuracy')
# The figure shows all three datasets, not only 2020.
fig.suptitle('Accuracy of the Bag of Words model in 2016, 2020 and 2016+2020 using the logistic regression')
fig.subplots_adjust(wspace=0.1, hspace=0.1)


# The table is labelled "C=1", so read the accuracies recorded at C=1 during the
# sweep instead of scoring the leftover C=10 models.
c1_index = param_range.index(1.0)
myList = [
    ["Year","Machine Learning method","Classifier","Testset accuracy C=1",],
    ["2016", "Bag of Words", "Logistic Regression", round(acc_table_test16[c1_index]*100,3)],
    ["2020", "Bag of Words", "Logistic Regression", round(acc_table_test20[c1_index]*100,3)],
    ["2016+2020", "Bag of Words", "Logistic Regression", round(acc_table_test1620[c1_index]*100,3)]

]

TableIt.printTable(myList, useFieldNames=True)
print('')
# plt.show() renders on the notebook's inline backend; fig.show() only emits a
# "non-GUI backend" warning there.
plt.show()

In [None]:
# Standard deviation of the cross-validated accuracy across the C sweep.
#
# NOTE(review): this cell used to read `train_scores` / `test_scores`, which no
# earlier cell defines, so it raised NameError on Restart & Run All. The
# (n_C_values, n_folds) layout implied by `axis=1` and the plot title match the
# output of sklearn's validation_curve, so the scores are computed here.
# Using the combined 2016+2020 Bag-of-Words training matrix with 5 folds is an
# assumption -- TODO confirm which dataset the original figure was built from.
train_scores, test_scores = validation_curve(
    LogisticRegression(), X_train_BagOfWords1620, y_train1620,
    param_name="C", param_range=param_range, cv=5)

# Calculate mean and standard deviation for training set scores
# (one value per C, taken across the folds).
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)

# Calculate mean and standard deviation for the validation-fold scores
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

# Create plot of Standard Deviation of Accuracy Score
plt.plot(param_range, train_std, label="Training Deviation", color="r")
plt.plot(param_range, test_std, label="validation Deviation", color="g")
plt.title("Validation Curve showing Standard Deviation With Logistic Regression")
plt.xlabel("C values")
plt.ylabel("Standard Deviation")
plt.tight_layout()
plt.legend(loc="best")
# Writes the figure to the working directory (matplotlib's default format).
plt.savefig ("figure1a")
plt.show()


N-Gram

In [None]:
# N-Gram: sweep the inverse regularisation strength C of a logistic regression
# and record the training and test accuracy for every dataset.
acc_table_train16, acc_table_test16 = [], []
acc_table_train20, acc_table_test20 = [], []
acc_table_train1620, acc_table_test1620 = [], []
param_range= [0.001,0.01,0.05,0.1,0.4,0.6,0.8,1.0,1.2,1.4,1.6,1.8,2.0,2.5,3,3.5,4,5,6,7,8,9,10]

# One entry per subplot:
# (title, train features, train labels, test features, test labels,
#  train accuracies, test accuracies)
ngram_panels = [
    ('In 2016', X_train_N_Gram16, y_train16,
     X_test_N_Gram16, y_test16, acc_table_train16, acc_table_test16),
    ('In 2020', X_train_N_Gram20, y_train20,
     X_test_N_Gram20, y_test20, acc_table_train20, acc_table_test20),
    ('2016 + 2020', X_train_N_Gram1620, y_train1620,
     X_test_N_Gram1620, y_test1620, acc_table_train1620, acc_table_test1620),
]

for c_value in param_range:
    fitted_models = []
    for panel_title, X_tr, y_tr, X_te, y_te, train_acc, test_acc in ngram_panels:
        # Fit on the training split, then score on both splits.
        model = LogisticRegression(C=c_value)
        model.fit(X_tr, y_tr)
        train_acc.append(model.score(X_tr, y_tr))
        test_acc.append(model.score(X_te, y_te))
        fitted_models.append(model)
    # Later cells use these names; once the loop ends they hold the models
    # fitted with the LAST value of param_range (C=10), not C=1.
    logit_NG_16, logit_NG_20, logit_NG_1620 = fitted_models

# One panel per dataset, sharing the accuracy axis. (The `sharey` keyword was
# corrupted by stray characters, which made the cell a syntax error.)
fig, axs = plt.subplots(1, 3, sharey='row', figsize=(22.5,8))
for ax, (panel_title, X_tr, y_tr, X_te, y_te, train_acc, test_acc) in zip(axs, ngram_panels):
    ax.plot(param_range, train_acc, label="Train score", color="g")
    ax.plot(param_range, test_acc, label="Test score", color="r")
    ax.set_title(panel_title)
    ax.set(xlabel='C value')
    ax.grid()
    # The curves carry labels, but they are only visible once a legend is drawn.
    ax.legend(loc="best")
axs[0].set(ylabel='Accuracy')
# The figure shows all three datasets, not only 2020.
fig.suptitle('Accuracy of the N-Gram model in 2016, 2020 and 2016+2020 using the logistic regression')
fig.subplots_adjust(wspace=0.1, hspace=0.1)

# The table is labelled "C=1", so read the accuracies recorded at C=1 during the
# sweep instead of scoring the leftover C=10 models.
c1_index = param_range.index(1.0)
myList = [
    ["Year","Machine Learning method","Classifier","Testset accuracy C=1",],
    ["2016", "N-Gram", "Logistic Regression", round(acc_table_test16[c1_index]*100,3)],
    ["2020", "N-Gram", "Logistic Regression", round(acc_table_test20[c1_index]*100,3)],
    ["2016+2020", "N-Gram", "Logistic Regression", round(acc_table_test1620[c1_index]*100,3)]

]

TableIt.printTable(myList, useFieldNames=True)
print('')
# plt.show() renders on the notebook's inline backend; fig.show() only emits a
# "non-GUI backend" warning there.
plt.show()

Logistic Regression is used when the dependent variable (target) is categorical.
Above, we are using a Binary Logistic Regression:
the categorical response has only two possible outcomes, Democrat or Republican.

### Confusion Matrix

Compute confusion matrix to evaluate the accuracy of a classification.

By definition a confusion matrix C is such that Cij is equal to the number of observations known to be in group i and predicted to be in group j.

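Thus, in our binary classification, where the labels are sorted alphabetically (index 0 = Democrat, index 1 = Republican), the count of true Democrat tweets is C[0][0] and the count of false Republican tweets (Democrat tweets predicted as Republican) is C[0][1]. Likewise, the count of false Democrat tweets (Republican tweets predicted as Democrat) is C[1][0], and the count of true Republican tweets is C[1][1].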

In [None]:
# Predict the party of every test tweet and compare it with the true label,
# for each dataset and feature type.
# Bag of Words
pred_lr_bow16 = logit_BoW_16.predict(X_test_BagOfWords16)
confB16 = confusion_matrix(y_test16, pred_lr_bow16)

pred_lr_bow20 = logit_BoW_20.predict(X_test_BagOfWords20)
confB20 = confusion_matrix(y_test20, pred_lr_bow20)

pred_lr_bow1620 = logit_BoW_1620.predict(X_test_BagOfWords1620)
confB1620 = confusion_matrix(y_test1620, pred_lr_bow1620)

# N-Gram
pred_lr_ng16 = logit_NG_16.predict(X_test_N_Gram16)
confNG16 = confusion_matrix(y_test16, pred_lr_ng16)

pred_lr_ng20 = logit_NG_20.predict(X_test_N_Gram20)
confNG20 = confusion_matrix(y_test20, pred_lr_ng20)

pred_lr_ng1620 = logit_NG_1620.predict(X_test_N_Gram1620)
confNG1620 = confusion_matrix(y_test1620, pred_lr_ng1620)


def _matrix_cells(matrix):
    """Return the four cells of a 2x2 confusion matrix in row-major order.

    confusion_matrix sorts the labels, so row/column 0 is Democrat and
    row/column 1 is Republican; rows are true labels, columns are predictions.
    """
    return [matrix[0, 0], matrix[0, 1], matrix[1, 0], matrix[1, 1]]


print("Confusion matrix")
# The first row holds the column names (useFieldNames=True).
myList = [
    ["Year","ML method","Classifier","True Democrat", "False Republican","False Democrat", "True Republican"],
]
table_rows = (
    ("2016", "Bag of Words", confB16),
    ("2016", "N-Gram", confNG16),
    ("2020", "Bag of Words", confB20),
    ("2020", "N-Gram", confNG20),
    ("2016+2020", "Bag of Words", confB1620),
    ("2016+2020", "N-Gram", confNG1620),
)
for year, method, matrix in table_rows:
    myList.append([year, method, "Log Reg"] + _matrix_cells(matrix))

TableIt.printTable(myList, useFieldNames=True)


Confusion matrix
+------------------------------------------------------------------------------------------------------------------------------------+
| Year             | ML method        | Classifier       | True Democrat    | False Republican | False Democrat   | True Republican  |
+------------------+------------------+------------------+------------------+------------------+------------------+------------------+
| 2016             | Bag of Words     | Log Reg          | 691              | 51               | 63               | 582              |
| 2016             | N-Gram           | Log Reg          | 700              | 42               | 50               | 595              |
| 2020             | Bag of Words     | Log Reg          | 485              | 35               | 29               | 598              |
| 2020             | N-Gram           | Log Reg          | 488              | 32               | 27               | 600              |
| 2016+2020        | Bag of Words     