## ANLY580 Project 1

### 404 Not Found -- Heng Zhou, Hongyang Zheng, Zhengqian Xu

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.util import ngrams 
from collections import Counter
import string
from statistics import stdev
from nltk.corpus import stopwords 
nltk.download('stopwords')
from nltk.classify import NaiveBayesClassifier
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from prettytable import PrettyTable
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hongyang_zheng/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/hongyang_zheng/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

## Task 2: Message Polarity

### Read Data

In [3]:
# Import data without Emoji
def read(dataset):
    
    with open('./data/Gold/'+dataset, encoding="utf-8") as file:
        train=[]
        train_y = []
        for line in file:
            tweet_train=line.split('\t')[2] 
            tweet_train=tweet_train.encode('ascii', 'ignore').decode('ascii')
            train.append(tweet_train[:-1])
            train_y.append(line.split('\t')[1])
    return train, train_y

train, train_y = read('train.txt')
dev, dev_y = read('dev.txt')
devt, devt_y = read('devtest.txt')
test, test_y = read('test.txt')    

### Preprocessing

Next, we will write functions to preprocess the data in the following aspects:
* remove URL
* remove stopwords
* remove Profiles
* remove Hashtags
* convert to lower case

In [4]:
# Preprocessing functions
def process_tweets(tweets):
    
    tweets_hash=[]
    tweets_punc=[]
    tweets_clean=[]

    for tweet in tweets:
          
        # Process text
        tokens = nltk.casual_tokenize(tweet)

        # Remove URL and stopwords
        tokens =[term for term in tokens if not term.startswith('https://') ]
        tokens =[term for term in tokens if not term.startswith('http://') ]
        
        # Remove stopwords
        stop_words = list(set(stopwords.words('english')))
        tokens = [term for term in tokens if not term in stop_words]

        # Remove profiles
        tokens1 = [term for term in tokens if not term.startswith('@')]
        
        # convert to lower case
        words = [w.lower() for w in tokens1]
        
        # tweets with hashtag and punctuation
        sentence1 = ' '.join(words)
        tweets_hash.append(sentence1)

        # Remove hashtags
        tokens2 = [term for term in words if not term.startswith('#')]
        
        # tweets with punctuation
        sentence2 = ' '.join(tokens2)
        tweets_punc.append(sentence2)
        
        # remove punctuations
        punctuation = list(string.punctuation)
        clean = [term for term in tokens2 if term not in punctuation]
        
        # clean tweets
        sentence3 = ' '.join(clean)
        tweets_clean.append(sentence3)
         
    return tweets_hash, tweets_clean, tweets_punc

In [5]:
# Create different datasets
train_hash, train_clean, train_punc = process_tweets(train)

dev_hash, dev_clean, dev_punc = process_tweets(dev)

devt_hash, devt_clean, devt_punc = process_tweets(devt)

test_hash, test_clean, test_punc = process_tweets(test)

Next, we will try some stemming/lemmatization. 

In [6]:
def stem(data):
    lemmatiser = WordNetLemmatizer()
    stemmer = PorterStemmer()
    sentence_stem = []
    
    for tweet in data:
        
        sentence = tweet.split()  
        stem = [stemmer.stem(word) for word in sentence]
        sentence_stem.append(" ".join(stem))
    
    return sentence_stem

###  Extract Features

In [7]:
def vectorizer(X_train, X_test):
    
    # Initialize vectorizer
    vectorizer = CountVectorizer(lowercase=False)
    # Fit
    vectorizer.fit(X_train)
    
    # Transform X_train and X_test
    X_train_dtm = vectorizer.transform(X_train)
    X_test_dtm = vectorizer.transform(X_test)
    
    return X_train_dtm, X_test_dtm

# Convert label to a numerical variable
def convert_label(x, y):
    
    # Convert to dataframe
    df_dict = {'text':x,'label':y}
    df = pd.DataFrame(df_dict)
    
    # Delete NA
    df=df[df['text']!= ""]
    
    # Label
    df['label'] = df['label'].map({'positive':0, 'neutral':1, 'negative':2})
    X = df['text']
    Y = df['label']
    
    return X, Y

### Lexicon-based Sentiment Analyzers -- Vader

In [8]:
# Function for Vader and store evaluation score
def Vader(tweets, Y_test):
    
    Y_prediction=[]
    analyzer = SentimentIntensityAnalyzer()
    
    for tweet in tweets:
        vs = analyzer.polarity_scores(tweet)
        if vs['compound'] >= 0.05 : 
            Y_prediction.append("positive")
        elif vs['compound'] <= - 0.05 : 
            Y_prediction.append("negative") 
        else: 
            Y_prediction.append("neutral")
            
    target_names = ['positive', 'neutral', 'negative']
    print('Classification Report for Vader:')
    print(classification_report(y_true=Y_test, y_pred=Y_prediction, target_names=target_names))
    
    precision,recall,fscore=score(Y_test, Y_prediction)[:3]
    res=[]
    res.append('Vader')
    for x in fscore:
        res.append(round(x,2))
    res.append(round(np.mean(fscore),2))
 
    for x in recall:
        res.append(round(x,2))
    res.append(round(np.mean(recall),2))

    for x in precision:
        res.append(round(x,2))
    res.append(round(np.mean(precision),2))
    
    return res

### Naive Bayes

In [9]:
# Function for Naive Bayes and store evaluation score
def Naive_bayes(X_train, Y_train, X_test, Y_test,a, if_fit=True,class_prob = None):
    
    nb = MultinomialNB(alpha=a, fit_prior=if_fit, class_prior= class_prob)
    nb.fit(X_train, Y_train)
    Y_prediction = nb.predict(X_test)
    
    target_names = ['positive', 'neutral', 'negative']
    print('Classification Report for Naive Bayes:')
    print(classification_report(y_true=Y_test, y_pred=Y_prediction, target_names=target_names))

    precision,recall,fscore = score(Y_test, Y_prediction)[:3]
    
    res=[]
    res.append('Naive Bayes')
    for x in fscore:
        res.append(round(x,2))
    res.append(round(np.mean(fscore),2))
 
    for x in recall:
        res.append(round(x,2))
    res.append(round(np.mean(recall),2))

    for x in precision:
        res.append(round(x,2))
    res.append(round(np.mean(precision),2))
    
    return res      

### Logistic Regression

In [10]:
# Function for logistic and store evaluation score
def Logistic(X_train_dtm, Y_train, X_test_dtm, Y_test, c, n):
    
    logreg = LogisticRegression(C=c, max_iter =n, class_weight='balanced', solver='newton-cg')
    logreg.fit(X_train_dtm, Y_train)
    Y_prediction = logreg.predict(X_test_dtm)
    
    target_names = ['positive', 'neutral', 'negative']
    print('Classification Report for Logistic Regression:')
    print(classification_report(y_true=Y_test, y_pred=Y_prediction, target_names=target_names))
    
    precision,recall,fscore=score(Y_test, Y_prediction)[:3]
    
    res=[]
    res.append('Logistic')
    for x in fscore:
        res.append(round(x,2))
    res.append(round(np.mean(fscore),2))
 
    for x in recall:
        res.append(round(x,2))
    res.append(round(np.mean(recall),2))

    for x in precision:
        res.append(round(x,2))
    res.append(round(np.mean(precision),2))
    
    return res  

## Task 3: Performance Evaluation

In [11]:
# Print performance table
def performance(A1,A2,A3):
    t = PrettyTable(['Classifier', 'F1+', 'F1|', 'F1-', 'F1_avg', 'R+', 'R|', 'R-', 'R_avg', \
                   'P+', 'P|', 'P-', 'P_avg'])
    t.add_row(A1)
    t.add_row(A2)
    t.add_row(A3) 
    print(t)

## Experiments

Before starting experiments, we tried different combinations of datasets as training data and compared the relative  performance. Finally we decided to use `train+dev+test` as training data and `devt` as test data.

Also, we tried different parameters for naive bayes and logistic regression and use the best model for below experiments. 

Below Experiments are based on different preprocessing and with/without stemming.

### Experiment 1 -- Use Clean Data

In [12]:
# Model 1
model_x = train_clean+dev_clean+test_clean
model_y = train_y+ dev_y+test_y

x_test = devt_clean[:]
y_test = devt_y[:]
X_train, Y_train=convert_label(model_x, model_y)
X_test, Y_test=convert_label(x_test, y_test)
X_train_dtm, X_test_dtm=vectorizer(X_train, X_test)

Vader_Score1=Vader(model_x, model_y)
Naive_Bayes_Score1=Naive_bayes(X_train_dtm, Y_train, X_test_dtm, Y_test, a=1.0, if_fit=True, class_prob = [0.2,0.3,0.5])
Log_Score1=Logistic(X_train_dtm, Y_train, X_test_dtm, Y_test, c=0.1, n=100)

Classification Report for Vader:
             precision    recall  f1-score   support

   positive       0.37      0.57      0.45      4485
    neutral       0.60      0.36      0.45     13150
   negative       0.52      0.66      0.58     10996

avg / total       0.53      0.51      0.50     28631

Classification Report for Naive Bayes:
             precision    recall  f1-score   support

   positive       0.69      0.57      0.62       994
    neutral       0.41      0.46      0.43       681
   negative       0.40      0.53      0.45       325

avg / total       0.55      0.52      0.53      2000

Classification Report for Logistic Regression:
             precision    recall  f1-score   support

   positive       0.70      0.60      0.65       994
    neutral       0.45      0.51      0.48       681
   negative       0.42      0.49      0.45       325

avg / total       0.57      0.55      0.56      2000



In [13]:
performance(Vader_Score1, Naive_Bayes_Score1, Log_Score1)

+-------------+------+------+------+--------+------+------+------+-------+------+------+------+-------+
|  Classifier | F1+  | F1|  | F1-  | F1_avg |  R+  |  R|  |  R-  | R_avg |  P+  |  P|  |  P-  | P_avg |
+-------------+------+------+------+--------+------+------+------+-------+------+------+------+-------+
|    Vader    | 0.45 | 0.45 | 0.58 |  0.5   | 0.57 | 0.36 | 0.66 |  0.53 | 0.37 | 0.6  | 0.52 |  0.5  |
| Naive Bayes | 0.62 | 0.43 | 0.45 |  0.5   | 0.57 | 0.46 | 0.53 |  0.52 | 0.69 | 0.41 | 0.4  |  0.5  |
|   Logistic  | 0.65 | 0.48 | 0.45 |  0.53  | 0.6  | 0.51 | 0.49 |  0.54 | 0.7  | 0.45 | 0.42 |  0.53 |
+-------------+------+------+------+--------+------+------+------+-------+------+------+------+-------+


### Experiment 2 -- Use Data with Hashtag and Punctuation 

In [14]:
# model 2
model_x = train_hash+dev_hash+test_hash
model_y = train_y+ dev_y+test_y

x_test = devt_hash[:]
y_test = devt_y[:]

X_train, Y_train=convert_label(model_x, model_y)
X_test, Y_test=convert_label(x_test, y_test)
X_train_dtm, X_test_dtm=vectorizer(X_train, X_test)

Vader_Score2=Vader(model_x, model_y)
Naive_Bayes_Score2=Naive_bayes(X_train_dtm, Y_train, X_test_dtm, Y_test, a=1.0, if_fit=True, class_prob = [0.2,0.3,0.5])
Log_Score2=Logistic(X_train_dtm, Y_train, X_test_dtm, Y_test, c=0.2, n=100)

Classification Report for Vader:
             precision    recall  f1-score   support

   positive       0.37      0.57      0.45      4485
    neutral       0.60      0.36      0.45     13150
   negative       0.52      0.66      0.58     10996

avg / total       0.53      0.51      0.50     28631

Classification Report for Naive Bayes:
             precision    recall  f1-score   support

   positive       0.69      0.56      0.62       994
    neutral       0.41      0.45      0.43       681
   negative       0.39      0.51      0.44       325

avg / total       0.54      0.52      0.53      2000

Classification Report for Logistic Regression:
             precision    recall  f1-score   support

   positive       0.70      0.61      0.65       994
    neutral       0.46      0.51      0.48       681
   negative       0.43      0.49      0.46       325

avg / total       0.57      0.56      0.56      2000



In [15]:
performance(Vader_Score2, Naive_Bayes_Score2, Log_Score2)

+-------------+------+------+------+--------+------+------+------+-------+------+------+------+-------+
|  Classifier | F1+  | F1|  | F1-  | F1_avg |  R+  |  R|  |  R-  | R_avg |  P+  |  P|  |  P-  | P_avg |
+-------------+------+------+------+--------+------+------+------+-------+------+------+------+-------+
|    Vader    | 0.45 | 0.45 | 0.58 |  0.5   | 0.57 | 0.36 | 0.66 |  0.53 | 0.37 | 0.6  | 0.52 |  0.5  |
| Naive Bayes | 0.62 | 0.43 | 0.44 |  0.5   | 0.56 | 0.45 | 0.51 |  0.51 | 0.69 | 0.41 | 0.39 |  0.49 |
|   Logistic  | 0.65 | 0.48 | 0.46 |  0.53  | 0.61 | 0.51 | 0.49 |  0.54 | 0.7  | 0.46 | 0.43 |  0.53 |
+-------------+------+------+------+--------+------+------+------+-------+------+------+------+-------+


From above result, we can see that compared with clean data, data with hashtag and punctuation does not improve the model performance. Actually, the performance of naive bayes model decreases.

### Experiment 3 -- Use Data with Punctuation 

In [16]:
# model 3
model_x = train_punc+dev_punc+test_punc
model_y = train_y+ dev_y+test_y

x_test = devt_punc[:]
y_test = devt_y[:]

X_train, Y_train=convert_label(model_x, model_y)
X_test, Y_test=convert_label(x_test, y_test)
X_train_dtm, X_test_dtm=vectorizer(X_train, X_test)

Vader_Score3=Vader(model_x, model_y)
Naive_Bayes_Score3=Naive_bayes(X_train_dtm, Y_train, X_test_dtm, Y_test, a=1.0, if_fit=True, class_prob = [0.2,0.3,0.5])
Log_Score3=Logistic(X_train_dtm, Y_train, X_test_dtm, Y_test, c=0.1, n=100)

Classification Report for Vader:
             precision    recall  f1-score   support

   positive       0.37      0.57      0.45      4485
    neutral       0.60      0.36      0.45     13150
   negative       0.52      0.66      0.58     10996

avg / total       0.53      0.51      0.50     28631

Classification Report for Naive Bayes:
             precision    recall  f1-score   support

   positive       0.69      0.57      0.62       994
    neutral       0.41      0.46      0.43       681
   negative       0.40      0.53      0.45       325

avg / total       0.55      0.52      0.53      2000

Classification Report for Logistic Regression:
             precision    recall  f1-score   support

   positive       0.70      0.60      0.65       994
    neutral       0.45      0.51      0.48       681
   negative       0.42      0.49      0.45       325

avg / total       0.57      0.55      0.56      2000



In [17]:
performance(Vader_Score3, Naive_Bayes_Score3, Log_Score3)

+-------------+------+------+------+--------+------+------+------+-------+------+------+------+-------+
|  Classifier | F1+  | F1|  | F1-  | F1_avg |  R+  |  R|  |  R-  | R_avg |  P+  |  P|  |  P-  | P_avg |
+-------------+------+------+------+--------+------+------+------+-------+------+------+------+-------+
|    Vader    | 0.45 | 0.45 | 0.58 |  0.5   | 0.57 | 0.36 | 0.66 |  0.53 | 0.37 | 0.6  | 0.52 |  0.5  |
| Naive Bayes | 0.62 | 0.43 | 0.45 |  0.5   | 0.57 | 0.46 | 0.53 |  0.52 | 0.69 | 0.41 | 0.4  |  0.5  |
|   Logistic  | 0.65 | 0.48 | 0.45 |  0.53  | 0.6  | 0.51 | 0.49 |  0.54 | 0.7  | 0.45 | 0.42 |  0.53 |
+-------------+------+------+------+--------+------+------+------+-------+------+------+------+-------+


Data only with punctuation seems as good as the clean data.

### Experiment 4 -- Use Data with Punctuation and Stemming

In [18]:
# Create stemmed datasets - data with punctuation
train_stem = stem(train_punc)
dev_stem = stem(dev_punc)
devt_stem = stem(devt_punc)
test_stem = stem(test_punc)

In [19]:
# model 4
model_x = train_stem+dev_stem+test_stem
model_y = train_y+ dev_y+test_y

x_test = devt_stem[:]
y_test = devt_y[:]

X_train, Y_train=convert_label(model_x, model_y)
X_test, Y_test=convert_label(x_test, y_test)
X_train_dtm, X_test_dtm=vectorizer(X_train, X_test)

Vader_Score4=Vader(model_x, model_y)
Naive_Bayes_Score4=Naive_bayes(X_train_dtm, Y_train, X_test_dtm, Y_test, a=1.0, if_fit=True, class_prob = [0.2,0.3,0.5])
Log_Score4=Logistic(X_train_dtm, Y_train, X_test_dtm, Y_test, c=0.5, n=100)

Classification Report for Vader:
             precision    recall  f1-score   support

   positive       0.38      0.53      0.44      4485
    neutral       0.56      0.42      0.48     13150
   negative       0.51      0.58      0.55     10996

avg / total       0.51      0.50      0.50     28631

Classification Report for Naive Bayes:
             precision    recall  f1-score   support

   positive       0.70      0.55      0.62       994
    neutral       0.41      0.42      0.41       681
   negative       0.37      0.58      0.45       325

avg / total       0.55      0.51      0.52      2000

Classification Report for Logistic Regression:
             precision    recall  f1-score   support

   positive       0.70      0.62      0.66       994
    neutral       0.46      0.48      0.47       681
   negative       0.40      0.51      0.45       325

avg / total       0.57      0.55      0.56      2000



In [20]:
performance(Vader_Score4, Naive_Bayes_Score4, Log_Score4)

+-------------+------+------+------+--------+------+------+------+-------+------+------+------+-------+
|  Classifier | F1+  | F1|  | F1-  | F1_avg |  R+  |  R|  |  R-  | R_avg |  P+  |  P|  |  P-  | P_avg |
+-------------+------+------+------+--------+------+------+------+-------+------+------+------+-------+
|    Vader    | 0.44 | 0.48 | 0.55 |  0.49  | 0.53 | 0.42 | 0.58 |  0.51 | 0.38 | 0.56 | 0.51 |  0.48 |
| Naive Bayes | 0.62 | 0.41 | 0.45 |  0.49  | 0.55 | 0.42 | 0.58 |  0.52 | 0.7  | 0.41 | 0.37 |  0.49 |
|   Logistic  | 0.66 | 0.47 | 0.45 |  0.53  | 0.62 | 0.48 | 0.51 |  0.54 | 0.7  | 0.46 | 0.4  |  0.52 |
+-------------+------+------+------+--------+------+------+------+-------+------+------+------+-------+


The performance of this kind of combination of preprocessing is not good.

### Experiment 5 -- Use Clean Data with Stemming

In [21]:
# Create stemmed datasets - clean data
train_stem = stem(train_clean)
dev_stem = stem(dev_clean)
devt_stem = stem(devt_clean)
test_stem = stem(test_clean)

In [22]:
model_x = train_stem+dev_stem+test_stem
model_y = train_y+ dev_y+test_y

x_test = devt_stem[:]
y_test = devt_y[:]

X_train, Y_train=convert_label(model_x, model_y)
X_test, Y_test=convert_label(x_test, y_test)
X_train_dtm, X_test_dtm=vectorizer(X_train, X_test)

Vader_Score5=Vader(model_x, model_y)
Naive_Bayes_Score5=Naive_bayes(X_train_dtm, Y_train, X_test_dtm, Y_test, a=1.0, if_fit=True, class_prob = [0.2,0.3,0.5])
Log_Score5=Logistic(X_train_dtm, Y_train, X_test_dtm, Y_test, c=0.1, n=100)

Classification Report for Vader:
             precision    recall  f1-score   support

   positive       0.38      0.53      0.44      4485
    neutral       0.56      0.42      0.48     13150
   negative       0.51      0.58      0.55     10996

avg / total       0.51      0.50      0.50     28631

Classification Report for Naive Bayes:
             precision    recall  f1-score   support

   positive       0.70      0.55      0.62       994
    neutral       0.41      0.42      0.41       681
   negative       0.37      0.58      0.45       325

avg / total       0.55      0.51      0.52      2000

Classification Report for Logistic Regression:
             precision    recall  f1-score   support

   positive       0.70      0.61      0.65       994
    neutral       0.45      0.49      0.47       681
   negative       0.40      0.50      0.45       325

avg / total       0.57      0.55      0.56      2000



In [23]:
performance(Vader_Score5, Naive_Bayes_Score5, Log_Score5)

+-------------+------+------+------+--------+------+------+------+-------+------+------+------+-------+
|  Classifier | F1+  | F1|  | F1-  | F1_avg |  R+  |  R|  |  R-  | R_avg |  P+  |  P|  |  P-  | P_avg |
+-------------+------+------+------+--------+------+------+------+-------+------+------+------+-------+
|    Vader    | 0.44 | 0.48 | 0.55 |  0.49  | 0.53 | 0.42 | 0.58 |  0.51 | 0.38 | 0.56 | 0.51 |  0.48 |
| Naive Bayes | 0.62 | 0.41 | 0.45 |  0.49  | 0.55 | 0.42 | 0.58 |  0.52 | 0.7  | 0.41 | 0.37 |  0.49 |
|   Logistic  | 0.65 | 0.47 | 0.45 |  0.52  | 0.61 | 0.49 | 0.5  |  0.53 | 0.7  | 0.45 | 0.4  |  0.52 |
+-------------+------+------+------+--------+------+------+------+-------+------+------+------+-------+


This is also not a good choice. So we finally choose to use the preprocessing method for clean data to preprocess the input data. For the model part, logistic model seems always outperform the other two methods regardless how we preprocess the dataset. So we will use logistic model. 

## Short Answers

### Q1 Speculate on the differences between the two performance measures above.

$$ 
Recall = \frac{True Positive}{Actual Positive} =  \frac{True Positive}{True Positive+False Negative}  \\
Precision = \frac{True Positive}{Predict Positive} =  \frac{True Positive}{True Positive+False Positive}  \\
F1 score = 2*\frac{Precision*Recall}{Precision+Recall}
$$

Recall refers to the fraction of relevant instances that have been retrieved over the total amount of relevant instances. For example, the $R_+$ means fraction of positive tweets we have captured among all actual positive tweets.   

Precision refers to the fraction of relevant instances among the retrieved instances. For example, the $P_+$ means fraction of positive tweets we have captured among all predicted positive tweets.  

 F1-Score is the weighted average of Precision and Recall

### Q2 Look at your results and find / show examples where your classifiers have mis-performed

In [24]:
model_x = train_clean+dev_clean+test_clean
model_y = train_y+ dev_y+test_y

x_test = devt_clean[:]
y_test = devt_y[:]
X_train, Y_train=convert_label(model_x, model_y)
X_test, Y_test=convert_label(x_test, y_test)
X_train_dtm, X_test_dtm=vectorizer(X_train, X_test)

In [25]:
logreg = LogisticRegression(C=0.1, max_iter =100, class_weight='balanced', solver='newton-cg')
logreg.fit(X_train_dtm, Y_train)
Y_prediction = logreg.predict(X_test_dtm)

LogisticRegression(C=0.1, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='newton-cg', tol=0.0001, verbose=0, warm_start=False)

In [26]:
df_result = pd.DataFrame(columns=['text','label','prediction'])
df_result['text'] = x_test
df_result['label'] = Y_test
df_result['prediction'] = Y_prediction

In [27]:
df_result[df_result['label']!=df_result['prediction']][20:50]

df1=df_result[df_result['label']!=df_result['prediction']][20:50]
for x in df1['text']:
    print(x)

Unnamed: 0,text,label,prediction
50,everybody chill sony may separate global flags...,1,2
56,sony announced new 500 gb playstation 4 bundle...,0,1
59,could hit 3rd console war wrote characters wen...,1,2
60,sun daily lenovo sony marshall london on-trend...,0,1
61,one voted 5th popular song time poll conducted...,0,1
62,that's i think may weird last minute deal sony...,0,1
66,sony 1st 1 film summer war room fought compton...,0,1
68,uncharted march may thing bring back sony,0,1
71,ok ... sony announced playable games line toky...,0,1
76,i got love sony they disappear tomorrow someon...,2,0


everybody chill sony may separate global flagship phone works unfortunately
sony announced new 500 gb playstation 4 bundle uncharted 4 stored october 9 400 us 450 canada
could hit 3rd console war wrote characters went sony's side
sun daily lenovo sony marshall london on-trend smartphones some smartphone trends come go ...
one voted 5th popular song time poll conducted sony
that's i think may weird last minute deal sony mirrors edge solid 12 tv's
sony 1st 1 film summer war room fought compton 9.3 m 8.8 m
uncharted march may thing bring back sony
ok ... sony announced playable games line tokyo games show 2015 september 15 interestingly ...
i got love sony they disappear tomorrow someone else would replace try harder consumers
sony pictures last monday sony pictures released first theatrical trailer concussion studio's ...
sony's project morpheus review a virtual reality headset may change future gaming ...
for info would need contact sony apologies inconvenience may caused ai
tomorrow mo

For example, for the first sentence, the model predicts it as negative, while the sentence is neutral actually. I think this is because the sentence contains the word 'unfortunately'. For the second sentence, I personally think it should be neutral without looking at the label, but it is actually positive. 

### Q3 What sorts of phenomena do you see and speculate on why you see these errors. 

We can see that many positive/negative tweets are mis-classified to neutral one. From the result of logistic regression, we can see the `P|(precision of neutral)` is low compared to `P+(precision of positive)`. Since precision is fraction of relevant instances among the retrieved instances, it means many non-neutral tweets are classified as neutral.

### Q4 Are there distinct differences between classifiers or are differences difficult to see from your results?

In [28]:
performance(Vader_Score1, Naive_Bayes_Score1, Log_Score1)

+-------------+------+------+------+--------+------+------+------+-------+------+------+------+-------+
|  Classifier | F1+  | F1|  | F1-  | F1_avg |  R+  |  R|  |  R-  | R_avg |  P+  |  P|  |  P-  | P_avg |
+-------------+------+------+------+--------+------+------+------+-------+------+------+------+-------+
|    Vader    | 0.45 | 0.45 | 0.58 |  0.5   | 0.57 | 0.36 | 0.66 |  0.53 | 0.37 | 0.6  | 0.52 |  0.5  |
| Naive Bayes | 0.62 | 0.43 | 0.45 |  0.5   | 0.57 | 0.46 | 0.53 |  0.52 | 0.69 | 0.41 | 0.4  |  0.5  |
|   Logistic  | 0.65 | 0.48 | 0.45 |  0.53  | 0.6  | 0.51 | 0.49 |  0.54 | 0.7  | 0.45 | 0.42 |  0.53 |
+-------------+------+------+------+--------+------+------+------+-------+------+------+------+-------+


As the result table shows, we can see that there is a great difference between the result of Vader and the results of Naive Bayes, Logistic regression. Vader performs better in $R-$ and $P|$ while other two methods perform better in $R+$ and $P+$. It may because Vader calculates the scores by adding up the score for each word, while the other two methods try to learn the features from the observations.

### Q5 How important was tokenization / feature extraction?

It helps improve accuracy when doing sentiment analysis. For example, excluding URL will decrease the noise in the sentence because it is only a website link and cannot represent any meaning.

### Q6 If you had more time, what might you do differently?

* We would use cross validation to choose exact optimal parameters for the model. 
* The input data contains emoji but our training data does not contain emoji. We can find some data that includes emoji, so we can train the model with emoji.
* We can add more words in Vader lexicon by looking at the mis-classification's score and figuring out the reason.

### Q7 What questions do you now have about your analysis that you didn't have before starting?

* How punctuation, hashtag, profile influence the result remains unsolved since the result of each experiment seems the same?

* Why Naive bayes and logistic regression did not perform as well as we expected?

* Is there any other model that has high accuracy for us to do sentiment analysis?

## Predict Input Data

In [29]:
# Import data without Emoji
with open('./data/Dev/INPUT.txt', encoding="utf-8") as file:
    final=[]
    user_id=[]
    for line in file:
        tweet=line.split('\t')[2] 
        tweet=tweet.encode('ascii', 'ignore').decode('ascii')
        user_id.append(line.split('\t')[0])
        final.append(tweet[:-1])

In [30]:
# Preprocess the data using the same procedure
final_hash, final_clean, final_punc = process_tweets(final)

In [31]:
def Predict_Label(X_train_dtm, Y_train, X_test_dtm, c, n):   
    logreg = LogisticRegression(C=c, max_iter =n, class_weight='balanced', solver='newton-cg')
    logreg.fit(X_train_dtm, Y_train)
    Y_prediction = logreg.predict(X_test_dtm)
    
    return list(Y_prediction)

In [32]:
# Predict label using logistic model
model_x = train_clean+dev_clean+test_clean
model_y = train_y+ dev_y+test_y
x_test = final_clean[:]

# Convert to dataframe
df_dict = {'text':x_test}
df = pd.DataFrame(df_dict)
X_test = df['text']

X_train, Y_train=convert_label(model_x, model_y)
X_train_dtm, X_test_dtm=vectorizer(X_train, X_test)

label=Predict_Label(X_train_dtm, Y_train, X_test_dtm, c=0.1, n=100)

In [33]:
# Convert output to dataframe
output = {'id': user_id,'label':label}
df = pd.DataFrame(output)
    
# Convert label
df['label'] = df['label'].map({0:'positive', 1:'neutral', 2:'negative'})

In [34]:
# Write out to tab-separated csv
path='./data/Dev/Output.csv'
df.to_csv(path, index=False, sep='\t')