In [22]:
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
import re

import nltk
from nltk import word_tokenize
from nltk.corpus import wordnet # pos
from nltk.corpus import stopwords
from sklearn.feature_extraction import _stop_words as stop_words

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings("ignore", category=PendingDeprecationWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

from functions import *

In [2]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

### Dataset

In [23]:
# hand-labeled dataset. 
# Cleaned (not duplicates). 
# Resized to have the same size for all three classes
data = []
data_labels = []
with open("Data/neg_u_13k.csv", encoding="utf8") as f:
  for i in f: 
    #data.append(eval(i).decode())  # we don eval/decode because each line is a string of a binary string
    #data.append(i.replace("b'", '').replace('b"', '').replace('"\n', '').replace("'\n", ''))
    data.append(i)
    data_labels.append('neg')
with open("Data/pos_u_13k.csv", encoding="utf8") as f:
  for i in f: 
    data.append(i)
    data_labels.append('pos')
with open("Data/neu_u_13k.csv", encoding="utf8") as f:
  for i in f: 
    data.append(i)
    data_labels.append('neu')


import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_colwidth', -1)

df = pd.DataFrame(zip(data, data_labels), columns=['tweet', 'sentiment'])
non_test, test = train_test_split(df, test_size=0.2, random_state=42)
train, val = train_test_split(non_test, test_size=0.25, random_state=42)

test.reset_index(drop=True, inplace=True)

In [24]:
df.iloc[:10]

Unnamed: 0,tweet,sentiment
0,b'- THEN let it fuck u in the ass'\n,neg
1,"b'!!!!!!!!!!!!!!!!!!!! @ms_fabdee: For the love of God, please smell nice.""""'\n",neg
2,"b'"" 6 children, 2 adults dead in Florida shooting: BELL, Fla. (AP) \xe2\x80\x94 A once-convicted felon killed six ... http://t.co/BZw4fdoaZz #science'\n",neg
3,"b'"" Hopes for the future..? Yes. However , getting there with you...is not much different than being alone..!""'\n",neg
4,"b'"" If you don\'t have anything nice to say, don\'t say anything at all.""'\n",neg
5,"b""- isn't there, or that his beard looks weird, or that this whole fucking situation is completely odd!""\n",neg
6,"b""- Like don't Nobody Want Me Frfr ,""\n",neg
7,"b'"" Love,you hurt my heart ""'\n",neg
8,"b'"" Mfs stay feeling some type about shit I post on MY twitter ..Like if it bothers you so bad pray to Jesus about it maybe he\'ll fix it \xf0\x9f\x98\xb9\xf0\x9f\x91\x8c""'\n",neg
9,"b'"" Mr Cereal lover, I wish your mother loved you like I would of that way you could of known how to love a woman "" \xf0\x9f\x91\x8f\xf0\x9f\x99\x8c'\n",neg


### Baselines

##### Random Selection Model (33%)
A random classification model would have around 33% accuracy as there are three classes and they are balanced.

##### Simple Rule-based (38%)

In [6]:
rule_pos = set()
rule_neg = set()
with open("Data/rule_neg.txt", encoding="utf8") as f:
  for i in f: 
    rule_neg.add(i.rstrip('\n').lower())
with open("Data/rule_pos.txt", encoding="utf8") as f:
  for i in f: 
    rule_pos.add(i.rstrip('\n').lower())

def label(string):
  pos_count, neg_count = 0, 0
  for i in string.split():
    if i.lower() in rule_pos:
      pos_count += 1
    if i.lower() in rule_neg:
      neg_count += 1
  if pos_count == neg_count:
    if pos_count == 0:
      return 'neu'
    else: 
      return 'neg'
  if pos_count > neg_count:
    return 'pos'
  return 'neg'

correct_count = 0
for index, (tweet, sentiment) in df.iterrows():
  if label(tweet) == sentiment:
    correct_count += 1
    
print(round( (correct_count/len(df) * 100), 2))

38.61


##### NLTK API (48%)

In [188]:
n = 1000

import requests
def nltk_label(text):
  res = requests.post('http://text-processing.com/api/sentiment/', data={'text':text})
  sentiment = eval(res.text)['label']
  return sentiment if sentiment != 'neutral' else 'neu'

correct_count = 0
for index, (tweet, sentiment) in test.iterrows():
  if index >= n:  # because of throttle
    break
  try:
    if nltk_label(tweet) == sentiment:
      correct_count += 1
  except Exception as e:
    print(e)
    print(tweet)
    print(round( (correct_count/index * 100), 2))
    break
  
    
print(round( (correct_count/n * 100), 2))

47.9


##### Initial ML Model without any improvements (59.82%)

In [138]:
def run_model(data, data_labels, vectorizer_params={}, model_params={}, model=None, verbose=False):  # for Naive Bayes: model = MultinomialNB()
  
  vectorizer = CountVectorizer(**vectorizer_params)
  features = vectorizer.fit_transform(data)
  feature_names = vectorizer.get_feature_names()

  X_train, X_val, y_train, y_val  = train_test_split(features, data_labels, test_size=0.2, random_state=42)
  
  if not model:
    model = LogisticRegression(**model_params, solver='newton-cg')
  
  if verbose:
    #print(vectorizer)
    print('Number of Features: ', len(feature_names))
    #print('model: ', model)
    
  model.fit(X=X_train, y=y_train)
  y_pred = model.predict(X_val)
  if verbose:
    print('Accuracy: {}%'.format(round(accuracy_score(y_val, y_pred) * 100, 2)))
  return feature_names

In [165]:
feature_names = run_model(train['tweet'], train['sentiment'], verbose=True)

Number of Features:  47531
Accuracy: 59.82%


In [168]:
tokens = get_tokens(train['tweet'])
len(tokens)

283738

### Improvements

##### Simple Cleanup (60.39%)

In [141]:
def cleanup(text):
  if text[:2] in ('b"', "b'"):
    text = text[2:]
  # text = text.replace('_', ' ')
  # remove_digits = str.maketrans('', '', '012456789')
  # text = text.translate(remove_digits)

  tokens = text.split()
  filtered_tokens = ' '.join([ i for i in tokens if not i.startswith('http') and not i.startswith('@') ])
  return filtered_tokens

train_cleaned = train.copy()
train_cleaned['tweet'] = train['tweet'].apply(cleanup)
feature_names2 = run_model(train_cleaned['tweet'], train_cleaned['sentiment'], verbose=True)

Number of Features:  27472
Accuracy: 60.39%


##### Adding StopWords (didn't improve)

In [147]:
# sklearn stopwords (default)
print('*** sklearn stopwords')
feature_names = run_model(train_cleaned['tweet'], train_cleaned['sentiment'], vectorizer_params={'stop_words':'english'}, verbose=True)

# sklearn stopwords (default) cleaned
from sklearn.feature_extraction import _stop_words as stop_words
scikit_stopwords = stop_words.ENGLISH_STOP_WORDS
scikit_stopwords_cleaned = set(scikit_stopwords)
# removing the following words from the list of stopwords as they could be helpful in determining sentiment
for member in ['empty', 'less', 'too', 'alone', 'never', 'enough', 'can', 'everything', 'give', 'serious', 
               'will', 'always', 'couldnt', 'nobody', 'must', 'sincere', 'cant', 'down', 'cannot', 'cry', 
               'full', 'neither', 'nowhere', 'anything', 'nor', 'nothing', 'not', 'please', 
               'last', 'behind', 'out', 'every']:
  scikit_stopwords_cleaned.remove(member)
print('*** sklearn stopwords cleaned')
feature_names = run_model(train_cleaned['tweet'], train_cleaned['sentiment'], vectorizer_params={'stop_words':scikit_stopwords_cleaned}, verbose=True)

# nltk stopwords
from nltk.corpus import stopwords
nltk_stopwords = stopwords.words('english')
print('*** nltk stopwords')
feature_names = run_model(train_cleaned['tweet'], train_cleaned['sentiment'], vectorizer_params={'stop_words':nltk_stopwords}, verbose=True)

# nltk stopwords cleaned
nltk_stopwords_cleaned=["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "between", "into", "through", "during", "before", "after", "to", "from", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "more", "most", "other", "some", "such", "nor", "only", "own", "same", "so", "than", "very", "s", "t", "just", "don", "should", "now"]
print('*** nltk stopwords cleaned')
feature_names = run_model(train_cleaned['tweet'], train_cleaned['sentiment'], vectorizer_params={'stop_words':nltk_stopwords_cleaned}, verbose=True)

# the following is taken from the 100 most common tokens in the data that seems to have no sentiment
custom_stopwords = {'I', 'the', 'RT', 'to', 'a', 'you', 'and', 'in', 'is', 'of', 'my', 'for', 'me', 'on', 'it'
  , 'that', "I'm", 'be', 'so', 'this', 'with', 'your', 'have', 'at', 'just', 'are', 'but', 'i' 
  , 'was', 'all', 'get', 'up', 'do', 'when', '&amp;', '-', 'from', 'if', 'know', 'we', 'about', 'what'
  , 'The', "it's", 'as', 'they', 'one', 'by', 'no', 'see', 'go', 'You', 'how'
  , 'or', 'an', 'got', 'who', 'he', 'more', 'day', 'make', "you're", 'My', 'A', 'has', 'really', 'now', 'some'
  , "It's", 'back', 'would', 'going', 'his', 'been', 'new', 'only', 'her', 'even'
  , "I've", 'them', 'than', 'our', "that's", 'This', 'had', 'there', 'then', 'say', 'come', 'their', 'am'
}

print('*** custome stopwords')
feature_names = run_model(train_cleaned['tweet'], train_cleaned['sentiment'], vectorizer_params={'stop_words':custom_stopwords}, verbose=True)

*** sklearn stopwords
Number of Features:  27179
Accuracy: 59.31%
*** sklearn stopwords cleaned
Number of Features:  27211
Accuracy: 59.89%
*** nltk stopwords
Number of Features:  27331
Accuracy: 59.8%
*** nltk stopwords cleaned
Number of Features:  27361
Accuracy: 60.27%
*** custome stopwords




Number of Features:  27398
Accuracy: 59.93%


##### Stemming and Lemmatization (didn't improve)

In [155]:
lemmatizer = nltk.stem.WordNetLemmatizer()
stemmer = nltk.stem.SnowballStemmer('english')

for word in ['studies', 'studied', 'studying', 'loves', 'loved', 'came', 'test_this']:
  print(word, lemmatizer.lemmatize(word), stemmer.stem(word))

studies study studi
studied studied studi
studying studying studi
loves love love
loved loved love
came came came
test_this test_this test_thi


In [156]:
def cleanup2(text):
  return text.replace('_', ' ')
  
train_cleaned_stemmed = train_cleaned.copy()
#train_cleaned_stemmed = train_cleaned_stemmed.apply(cleanup2)
train_cleaned_stemmed['tweet'] = train_cleaned['tweet'].apply(stemmer.stem)
feature_names3 = run_model(train_cleaned_stemmed['tweet'], train_cleaned_stemmed['sentiment'], verbose=True)

Number of Features:  27960
Accuracy: 59.99%


In [164]:
train_cleaned_stemmed = train_cleaned.copy()
train_cleaned_stemmed['tweet'] = train_cleaned['tweet'].apply(cleanup2).apply(stemmer.stem)
feature_names3 = run_model(train_cleaned_stemmed['tweet'], train_cleaned_stemmed['sentiment'], verbose=True)

Number of Features:  27956
Accuracy: 60.03%


In [200]:
train_cleaned_stemmed[:10]

Unnamed: 0,tweet,sentiment
24773,rt me in a relationship \xf0\x9f\x98\xa1\xf0\x9f\x98\x8d,pos
3831,"that's ugly. \xf0\x9f\x98\xb4""",neg
36171,smooth criminal? nvm...lol,neu
17012,can you please follow me i love you so much \xf0\x9f\x8c\xba\xf0\x9f\x8c\xba\xf0\x9f\x8c\xba\xf0\x9f\x8c\xba \xe2\x9d\xa4\xef\xb8\x8f\xe2\x9d\xa4\xef\xb8\x8f\xe2\x9d\xa4\xef\xb8\x8f\xe2\x9d\xa4\xef\xb8\x8f\xf0\x9f\x92\x80\xf0\x9f\x92\x80\xf0\x9f\x92\x80hhaa,pos
33383,cc just came up with name listen to your kid,neu
38999,"gadget: after $2m in pre-orders, osmo starts shipping its hardware-based ipad game for kids, rolls out customizati...",neu
20504,blessed to see another day.,pos
22050,i liked a video how the amazing spider-man should have ended - bonus scen,pos
4307,. has been unbelievable 2nd half #efc,neg
23350,one person followed me in the last day thanks to app,pos


##### Handling Emojis
https://unicode.org/Public/emoji/15.0/emoji-test.txt
- We need to make every emoji a new token (e.g. 'test😡😍' is counted as one token, but it should be three tokens)
- If we categorize emojis into groups and use group name, that like would improve the accuracy

In [45]:
a = b'this is a test.\xf0\x9f\x98\xa1\xf0\x9f\x98\x8d'
print(len(a))
print(a.decode())

23
this is a test.😡😍


In [46]:
a.decode().split()

['this', 'is', 'a', 'test.😡😍']

In [71]:
t = b'\xf0\x9f\x98\xa1\xf0\x9f\x98\x8d'
for i in t.decode():
  print(unicodedata.category(i))

So
So


In [27]:
df['tweet'].iloc[:10]

0    b'- THEN let it fuck u in the ass'\n                                                                                                                                          
1    b'!!!!!!!!!!!!!!!!!!!! @ms_fabdee: For the love of God, please smell nice.""'\n                                                                                               
2    b'" 6 children, 2 adults dead in Florida shooting: BELL, Fla. (AP) \xe2\x80\x94 A once-convicted felon killed six ... http://t.co/BZw4fdoaZz #science'\n                      
3    b'" Hopes for the future..? Yes. However , getting there with you...is not much different than being alone..!"'\n                                                             
4    b'" If you don\'t have anything nice to say, don\'t say anything at all."'\n                                                                                                  
5    b"- isn't there, or that his beard looks weird, or that this whole fucking situation is complet

In [54]:
df2 = df['tweet'].iloc[8:10].map(lambda x: eval(x).decode())
df2

8    " Mfs stay feeling some type about shit I post on MY twitter ..Like if it bothers you so bad pray to Jesus about it maybe he'll fix it 😹👌"
9    " Mr Cereal lover, I wish your mother loved you like I would of that way you could of known how to love a woman " 👏🙌                      
Name: tweet, dtype: object

In [55]:
tokens = get_tokens(df2)
len(tokens)

57

In [57]:
' '.join(df2)

'" Mfs stay feeling some type about shit I post on MY twitter ..Like if it bothers you so bad pray to Jesus about it maybe he\'ll fix it 😹👌" " Mr Cereal lover, I wish your mother loved you like I would of that way you could of known how to love a woman " 👏🙌'

In [56]:
Counter(tokens).most_common()

[('"', 3),
 ('I', 3),
 ('it', 3),
 ('you', 3),
 ('about', 2),
 ('to', 2),
 ('of', 2),
 ('Mfs', 1),
 ('stay', 1),
 ('feeling', 1),
 ('some', 1),
 ('type', 1),
 ('shit', 1),
 ('post', 1),
 ('on', 1),
 ('MY', 1),
 ('twitter', 1),
 ('..Like', 1),
 ('if', 1),
 ('bothers', 1),
 ('so', 1),
 ('bad', 1),
 ('pray', 1),
 ('Jesus', 1),
 ('maybe', 1),
 ("he'll", 1),
 ('fix', 1),
 ('😹👌"', 1),
 ('Mr', 1),
 ('Cereal', 1),
 ('lover,', 1),
 ('wish', 1),
 ('your', 1),
 ('mother', 1),
 ('loved', 1),
 ('like', 1),
 ('would', 1),
 ('that', 1),
 ('way', 1),
 ('could', 1),
 ('known', 1),
 ('how', 1),
 ('love', 1),
 ('a', 1),
 ('woman', 1),
 ('👏🙌', 1)]

In [77]:
import emoji
emoji.demojize('test.😹👌')

'test.:cat_with_tears_of_joy::OK_hand:'

In [20]:
data_no_animate_emoji = convert_animated_emojis(data)
data_no_emoji = convert_text_emoji(data_no_animate_emoji)

In [8]:
model = LogisticRegression(solver = 'liblinear', max_iter= 1000, random_state=0)

In [59]:
run_model(data_no_emoji, data_labels, model=model)

0.633

##### Handling Emoticos
- In countVectorizer -> token_pattern param, it says "punctuation is completely ignored and always treated as a token separator". The messes up with emoticons (e.g. ":-)").

### Compressing

In [37]:
compresed_data = compress(data_no_emoji)

In [53]:
run_model(compresed_data, data_labels, model=model)

0.634

In [201]:
show_emoji('xf0x9fx98x82')

'😂'

In [25]:
u"\U0001f63b"

'😻'

### Visualize and find best accuracy for multiple inputs

In [None]:
X = range(1000, 4001, 200)
Y = []
best_acc = 0
for n in X:
  data = list(set(most_common(pos_lem_stem + neg_lem_stem, n)))
  acc = run_model(data)
  if acc > best_acc:
    best_acc = acc
  Y.append(acc) 

print('Best Accuracy: ', best_acc)
plt.plot(X, Y)
plt.show()

### Lematization and Stemming

### Cleaning Data

In [None]:
posStr = pos.lower() 
negStr = neg.lower()

posStr2 = posStr.replace("\'", '')
negStr2 = negStr.replace("\'", '')

posStr = pos.lower().replace("n\'t", ' not')
negStr = neg.lower().replace("n\'t", ' not')

pos_cleaned = re.findall("[a-z][a-z]+", posStr2)
neg_cleaned = re.findall("[a-z][a-z]+", negStr2)

pos_lem_stem = [ lemmatizer.lemmatize(i) for i in pos_cleaned ]
neg_lem_stem = [ lemmatizer.lemmatize(i) for i in neg_cleaned ]

### Finding Misclassified Inputs

In [None]:
misclassified_samples_pos = X_val[(y_val != y_pred) & (y_pred == 'pos')]
misclassified_samples_neg = X_val[(y_val != y_pred) & (y_pred == 'neg')]
res = vectorizer.inverse_transform(misclassified_samples_neg)

In [10]:
#model = LogisticRegression(**model_params, solver = 'liblinear', max_iter= 1000, random_state=0)
model = LogisticRegression(solver = 'liblinear', max_iter= 1000, random_state=0)

# for i in range(1000, 10001, 500):
#   vocab = [i[0] for i in Counter(tokens).most_common(i)]
#   run_model(data2_string_per_line, {'vocabulary':vocab})  # best 73.1

# black_list = ['xe2x80xa6','xe2x80x9c', 'xe2x80x9d', 'xefxb8x8f', 'xe2x80x99']

# for i in range(1000, 7501, 100):
#   vocab = [i[0] for i in Counter(tokens).most_common(i)]
#   print(i, run_model(data2_string_per_line, {'vocabulary':vocab}, model=model))  # best 73.1

0.635
