### Import necessary libraries

In [64]:
import pandas as pd
import numpy as np

### Load the training and test datasets.

In [65]:
column_names = ['id', 'processed_tweets', 'sentiment', 'targeted_or_not', 'target_type']
df_train = pd.read_csv('processed_trainingset.csv')[column_names]
df_train.head()

Unnamed: 0,id,processed_tweets,sentiment,targeted_or_not,target_type
0,86426,"['@user', 'ask', 'native', 'american', 'take']",1,0.0,
1,90194,"['@user', '@user', 'go', 'home', 'drunk', '@us...",1,1.0,1.0
2,16820,"['amazon', 'investigate', 'chinese', 'employee...",0,,
3,62688,"['@user', 'someone', 'piece', 'shit', 'volcano']",1,0.0,
4,43605,"['@user', '@user', 'obama', 'want', 'liberal',...",0,,


In [66]:
df_test = pd.read_csv('processed_testset.csv')[column_names]
df_test.head()

Unnamed: 0,id,processed_tweets,sentiment,targeted_or_not,target_type
0,15923,"['whoisq', 'wherestheserver', 'dumpnike', 'dec...",1,1.0,3.0
1,27014,"['constitutionday', 'revere', 'conservative', ...",0,,
2,30530,"['foxnews', 'nra', 'maga', 'potus', 'trump', '...",0,,
3,13876,"['watch', 'boomer', 'get', 'news', 'still', 'p...",0,,
4,60133,"['nopasaran', 'unity', 'demo', 'oppose', 'farr...",1,1.0,2.0


We will now convert the datasets into a form that can be used by the ML algorithms.

In [67]:
from ast import literal_eval

temp = list(df_train['processed_tweets'])
list_of_train_tweets = [None]*df_train.shape[0]
for i in range(df_train.shape[0]):
    list_of_train_tweets[i] = " ".join(literal_eval(temp[i]))
    
temp = list(df_test['processed_tweets'])
list_of_test_tweets = [None]*df_test.shape[0]
for i in range(df_test.shape[0]):
    list_of_test_tweets[i] = " ".join(literal_eval(temp[i]))

#### Function to create a vocabulary

In [68]:
def create_vocabulary(corpus):
    '''
    Creates a vocabulary out of the corpus.
    Input: List of strings, where each string is a processed tweet
    Output: List of words (vocabulary)
    '''
    # Initialize the vocabulary as an empty set
    vocab = {}
    for doc in corpus:
        # For every document, take the set union of the words in that document
        # with words already in the vocab
        words = doc.split(" ")
        vocab = set(vocab).union(words)
    return vocab

#### Labels for Task A

In [69]:
sk_training_labels = list(df_train['sentiment'])
sk_test_labels = list(df_test['sentiment'])

#### Converting the tweets into vectors based on frequency of occurrence

In [70]:
from sklearn.feature_extraction.text import CountVectorizer

# Create a vocabulary
vocab = create_vocabulary(list_of_train_tweets)
count_vector = CountVectorizer(vocabulary=vocab)


# Obtains a numpy array of frequencies of words in the document.
sk_training_tweets = count_vector.fit_transform(list_of_train_tweets).toarray()
sk_test_tweets = count_vector.fit_transform(list_of_test_tweets).toarray()

#### Multinomial Naive Bayes approach with CountVectorizer

In [71]:
from sklearn.naive_bayes import MultinomialNB

# Leaving the parameters at default ensures that the model learns the prior probabilities from the
# data and that it uses Laplace smoothing to deal with new words.
mnb = MultinomialNB()

mnb.fit(sk_training_tweets, sk_training_labels)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [72]:
# Performance on the training set
print("Accuracy on the training set : {} %".format(mnb.score(sk_training_tweets, sk_training_labels)*100))

# Performance on the training set
print("Accuracy on the test set : {} %".format(mnb.score(sk_test_tweets, sk_test_labels)*100))

Accuracy on the training set : 88.13392043729122 %
Accuracy on the test set : 79.18604651162791 %


#### Multinimial Naive Bayes approach with TfidfVectorizer

In [80]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Create a vocabulary
vocab = create_vocabulary(list_of_train_tweets)
tfidf_vector = TfidfVectorizer(vocabulary=vocab)


# Obtains a numpy array of frequencies of words in the document.
sk_training_tweets2 = tfidf_vector.fit_transform(list_of_train_tweets).toarray()
sk_test_tweets2 = tfidf_vector.fit_transform(list_of_test_tweets).toarray()

mnb2 = MultinomialNB()

mnb2.fit(sk_training_tweets2, sk_training_labels)

# Performance on the training set
print("Accuracy on the training set : {} %".format(mnb2.score(sk_training_tweets2, sk_training_labels)*100))

# Performance on the training set
print("Accuracy on the test set : {} %".format(mnb2.score(sk_test_tweets2, sk_test_labels)*100))

Accuracy on the training set : 77.7027027027027 %
Accuracy on the test set : 75.81395348837209 %


### Conclusion

We obtained better a higher accuracy for subtask A by using Multinomial Naive 