In [1]:
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
import re

import nltk
from nltk import word_tokenize
from nltk.corpus import wordnet # pos
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from sklearn.feature_extraction import _stop_words as stop_words

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings("ignore", category=PendingDeprecationWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

from functions import *

### Dataset

In [2]:
# Hand-labeled dataset.
# Cleaned (no duplicates).
# Resized so that all three classes have the same size.
import ast


def _load_labeled_tweets(path, label):
  """Read one tweet per line from `path` and pair every tweet with `label`.

  Each line of the file is the textual form of a bytes literal, so it is
  parsed back into bytes, decoded to str and passed through cleanup().
  ast.literal_eval is used instead of eval: it only accepts Python literals,
  so a line in the data file can never be executed as code.

  Returns a (tweets, labels) pair of equally long lists.
  """
  with open(path, encoding="utf8") as f:
    tweets = [cleanup(ast.literal_eval(line).decode()) for line in f]
  return tweets, [label] * len(tweets)


data = []
data_labels = []
# Order matters for reproducibility of the later train/validation split.
for _path, _label in (("Data/neg_u_13k.csv", 'neg'),
                      ("Data/pos_u_13k.csv", 'pos'),
                      ("Data/neu_u_13k.csv", 'neu')):
  _tweets, _labels = _load_labeled_tweets(_path, _label)
  data.extend(_tweets)
  data_labels.extend(_labels)

In [11]:
# Display the punctuation characters defined by the standard library.
# NOTE: label() in the rule-based section uses `string` as its parameter name,
# which hides this module inside that function only.
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

### Simple Rule-Based Models

In [3]:
# Sentiment lexicons for the rule-based classifier: one entry per line,
# stored lower-cased with the trailing newline removed.
rule_pos = set()
rule_neg = set()
for _lexicon_path, _lexicon in (("Data/rule_neg.txt", rule_neg),
                                ("Data/rule_pos.txt", rule_pos)):
  with open(_lexicon_path, encoding="utf8") as _fh:
    _lexicon.update(_line.rstrip('\n').lower() for _line in _fh)

def label(string):
  """Label a text 'pos', 'neg' or 'neu' by counting lexicon hits.

  The text is split on whitespace and every lower-cased token is looked up
  in the module-level sets rule_pos and rule_neg.

  Returns:
    'neu' when no token is found in either set,
    'pos' when positive hits strictly outnumber negative hits,
    'neg' otherwise.

  NOTE(review): a non-zero tie (as many positive as negative hits) is
  labelled 'neg' - confirm this is intended.
  """
  words = [token.lower() for token in string.split()]
  pos_count = sum(1 for word in words if word in rule_pos)
  neg_count = sum(1 for word in words if word in rule_neg)

  if pos_count == 0 and neg_count == 0:
    return 'neu'
  return 'pos' if pos_count > neg_count else 'neg'

# Accuracy (in percent, two decimals) of the rule-based labeller on the
# whole dataset: a tweet counts as correct when label() returns its gold label.
correct_count = sum(
  1 for position in range(len(data))
  if label(data[position]) == data_labels[position]
)

print(round( (correct_count/len(data) * 100), 2))

#TODO: Apply stemming (lover, loving, loved, etc all should be turned into love)

38.92


### Baseline Model

In [6]:
def train(data, data_labels, vectorizer_params=None, model_params=None, model=None, verbose=False):  # for Naive Bayes: model = MultinomialNB()
  """Vectorize `data`, fit a classifier and return its validation accuracy.

  Args:
    data: iterable of raw text documents.
    data_labels: labels aligned with `data`.
    vectorizer_params: optional dict of keyword arguments for CountVectorizer.
    model_params: optional dict of keyword arguments for the default
      LogisticRegression. Ignored when `model` is given. 'solver' defaults
      to 'liblinear' but can be overridden here.
    model: optional estimator exposing fit/predict; when None a
      LogisticRegression is created.
    verbose: when True, print the vectorizer and the number of features.

  Returns:
    Accuracy on a 20% hold-out split (random_state=42), as a percentage
    rounded to two decimals.

  Note: the vectorizer is fitted on all of `data` before the split, so the
  vocabulary also contains terms that only occur in the validation part.
  """
  vectorizer = CountVectorizer(**(vectorizer_params or {}))
  if verbose:
    print(vectorizer)

  features = vectorizer.fit_transform(data)
  if verbose:
    # vocabulary_ maps each term to its column index, so its length is the
    # feature count; unlike get_feature_names() it exists in every
    # scikit-learn version.
    print('Number of Features: ', len(vectorizer.vocabulary_))

  X_train, X_val, y_train, y_val = train_test_split(features, data_labels, test_size=0.2, random_state=42)

  # Compare against None explicitly: truth-testing an estimator calls its
  # __len__ when it has one, which raises on unfitted ensemble models.
  if model is None:
    model = LogisticRegression(**{'solver': 'liblinear', **(model_params or {})})

  model.fit(X=X_train, y=y_train)
  y_pred = model.predict(X_val)

  return round(accuracy_score(y_val, y_pred) * 100, 2)

In [7]:
# Baseline: cleaned tweets with train()'s defaults
# (CountVectorizer without extra parameters, LogisticRegression with liblinear).
train(data, data_labels)

59.21

### Removing Emojis

In [20]:
# Two-step emoji conversion; the output of the first helper feeds the second.
# NOTE(review): neither helper is defined in the visible cells (presumably
# they come from `functions`) - confirm what each one replaces.
data_no_animate_emoji = convert_animated_emojis(data)
data_no_emoji = convert_text_emoji(data_no_animate_emoji)

In [8]:
# Classifier passed to train() via model= in the experiments that follow;
# random_state is fixed so runs are repeatable.
model = LogisticRegression(solver = 'liblinear', max_iter= 1000, random_state=0)

In [59]:
# Same pipeline as the baseline, but on the emoji-converted tweets.
train(data_no_emoji, data_labels, model=model)

0.633

### Compressing

In [37]:
# Apply compress() to the emoji-converted tweets.
# NOTE(review): compress is not defined in the visible cells (presumably from
# `functions`) - confirm what it does to each tweet.
compresed_data = compress(data_no_emoji)

In [53]:
# Accuracy after compression, to compare with the emoji-only result.
train(compresed_data, data_labels, model=model)

0.634

### StopWords

In [38]:
# Effect of removing scikit-learn's built-in English stop words.
# ENGLISH_STOP_WORDS is imported from its public location; the old
# sklearn.feature_extraction.stop_words module no longer exists in
# recent scikit-learn releases.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
# CountVectorizer documents stop_words as 'english', a list or None, so the
# frozenset is converted to a (sorted, hence deterministic) list.
scikit_stopwords = sorted(ENGLISH_STOP_WORDS)

# train() requires the labels as its second argument.
print(train(data, data_labels, vectorizer_params={'stop_words':scikit_stopwords}, model=model))
print(train(data_no_emoji, data_labels, vectorizer_params={'stop_words':scikit_stopwords}, model=model))

0.582
0.627


In [40]:
# Effect of removing NLTK's English stop words (requires the NLTK
# 'stopwords' corpus to be available locally).
from nltk.corpus import stopwords
nltk_stopwords = stopwords.words('english')

# train() requires the labels as its second argument.
print(train(data, data_labels, vectorizer_params={'stop_words':nltk_stopwords}, model=model))
print(train(data_no_emoji, data_labels, vectorizer_params={'stop_words':nltk_stopwords}, model=model))

Exception ignored in: <_io.FileIO name='C:\\Users\\vvaezian.NET\\AppData\\Roaming\\nltk_data\\corpora\\stopwords\\english' mode='rb' closefd=True>


0.584
0.628


In [None]:
# Literal stop-word list; no other visible cell references it.
# NOTE(review): presumably a hand-trimmed copy of NLTK's English list - confirm
# which entries were dropped and why.
nltk_stopwords_cleaned=["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]

In [61]:
# Collect the tokens of the whole dataset and show how many there are.
# NOTE(review): get_tokens is not defined in the visible cells (presumably
# from `functions`) - confirm its tokenisation rules.
tokens = get_tokens(data)
len(tokens)

472997

In [57]:
# The 200 most frequent tokens with their counts, most frequent first.
Counter(tokens).most_common(200)

[('the', 10212),
 ('to', 9284),
 ('you', 8270),
 ('and', 5463),
 ('brt', 4935),
 ('in', 4521),
 ('is', 4505),
 ('my', 4406),
 ('of', 4270),
 ('for', 4116),
 ('rt', 4041),
 ('me', 3638),
 ('it', 3461),
 ('on', 3089),
 ('that', 2942),
 ('im', 2850),
 ('so', 2774),
 ('this', 2648),
 ('xf0x9fx98x82', 2645),
 ('be', 2549),
 ('xe2x80xa6', 2211),
 ('with', 2162),
 ('your', 2099),
 ('like', 2053),
 ('have', 2023),
 ('just', 1887),
 ('but', 1855),
 ('at', 1853),
 ('love', 1840),
 ('its', 1823),
 ('not', 1772),
 ('are', 1768),
 ('dont', 1751),
 ('if', 1586),
 ('all', 1485),
 ('we', 1413),
 ('xe2x80x9c', 1412),
 ('when', 1394),
 ('get', 1388),
 ('xe2x80x9d', 1378),
 ('can', 1360),
 ('do', 1359),
 ('was', 1358),
 ('up', 1337),
 ('what', 1260),
 ('out', 1240),
 ('xefxb8x8f', 1218),
 ('bi', 1207),
 ('no', 1145),
 ('from', 1112),
 ('know', 1102),
 ('about', 1067),
 ('will', 1062),
 ('they', 1060),
 ('one', 1031),
 ('good', 1021),
 ('people', 1001),
 ('as', 984),
 ('follow', 951),
 ('how', 936),
 ('go

In [12]:
# Look up what one of the frequent 'xf0x9f...' tokens stands for.
# NOTE(review): show_emoji is not defined in the visible cells (presumably
# from `functions`).
show_emoji('xf0x9fx98x82')

'😂'

### Visualize and find best accuracy for multiple inputs

In [None]:
# Sweep the vocabulary size and plot the validation accuracy of each setting.
# NOTE(review): pos_lem_stem / neg_lem_stem are only built in the
# "Cleaning Data" cell further down - that cell has to run first.
# NOTE(review): most_common is not defined in the visible cells; this assumes
# it returns the n most frequent words as plain strings - confirm.
X = range(1000, 4001, 200)
Y = []
for n in X:
  # Keep the word list in its own name so the global `data` (the tweets)
  # is not overwritten, and hand it to the vectorizer as a fixed vocabulary.
  vocab = list(set(most_common(pos_lem_stem + neg_lem_stem, n)))
  Y.append(train(data, data_labels, vectorizer_params={'vocabulary': vocab}))
best_acc = max(Y, default=0)

print('Best Accuracy: ', best_acc)
fig, ax = plt.subplots()
ax.plot(X, Y)
ax.set_xlabel('Vocabulary size')
ax.set_ylabel('Validation accuracy (%)')
plt.show()

### Lemmatization and Stemming

In [None]:
class LemmaTokenizer(object):
  """Callable tokenizer that returns the lemmas of a text's alphabetic tokens.

  Instances can be called with a single document (a string) and return a
  list of lemmas, which makes them usable wherever a tokenizer callable is
  expected.
  """
  def __init__(self):
    self.wnl = WordNetLemmatizer()
  def __call__(self, articles):
    """Tokenize `articles`, POS-tag the tokens and lemmatize the alphabetic ones.

    Lemmas of length 1 are dropped from the result.
    """
    # pos_tag expects a list of tokens. Passing a single string makes it tag
    # the individual characters, so the tag would describe the first letter
    # rather than the word. Tagging the whole token list once avoids that.
    tagged = nltk.pos_tag(word_tokenize(articles))
    # isalpha() already rejects any token containing '/', digits or punctuation.
    # NOTE(review): map_pos is not defined in the visible cells; presumably it
    # converts the tagger's tag into the POS value lemmatize() expects - confirm.
    lemmas = [self.wnl.lemmatize(token, map_pos(tag)) for token, tag in tagged if token.isalpha()]
    return [lemma for lemma in lemmas if len(lemma) > 1]

# Module-level lemmatizer and English Snowball stemmer.
# Among the visible cells only `lemmatizer` is used (in "Cleaning Data").
lemmatizer = WordNetLemmatizer()
stemmer = nltk.stem.SnowballStemmer('english')

### Cleaning Data

In [None]:
# Turn the raw positive / negative corpora into lists of lemmatized words.
# NOTE(review): `pos` and `neg` are not defined in any visible cell; they are
# used here as strings - confirm where they are loaded.

# Expand the "n't" contraction first ...
posStr = pos.lower().replace("n\'t", ' not')
negStr = neg.lower().replace("n\'t", ' not')

# ... and only then drop the remaining apostrophes. Doing it in the other
# order removes every "n't" before it can be expanded, so the negation
# would never reach the cleaned word lists.
posStr2 = posStr.replace("\'", '')
negStr2 = negStr.replace("\'", '')

# Keep runs of at least two lower-case ASCII letters.
pos_cleaned = re.findall("[a-z][a-z]+", posStr2)
neg_cleaned = re.findall("[a-z][a-z]+", negStr2)

# Despite the names, these lists are only lemmatized; no stemmer is applied.
pos_lem_stem = [ lemmatizer.lemmatize(i) for i in pos_cleaned ]
neg_lem_stem = [ lemmatizer.lemmatize(i) for i in neg_cleaned ]

### Finding Misclassified Inputs

In [None]:
# Select the validation rows whose prediction is wrong, split by predicted
# class, and map the wrongly-'neg' rows back to the terms they contain.
# NOTE(review): X_val, y_val, y_pred and vectorizer are not assigned at
# notebook level in the visible cells - they are local variables of train().
# This cell therefore needs them exposed some other way before it can run.
# NOTE(review): misclassified_samples_pos is computed but not used here.
misclassified_samples_pos = X_val[(y_val != y_pred) & (y_pred == 'pos')]
misclassified_samples_neg = X_val[(y_val != y_pred) & (y_pred == 'neg')]
res = vectorizer.inverse_transform(misclassified_samples_neg)

In [10]:
# Re-creates the same LogisticRegression configuration used in the earlier
# experiments. Everything below the assignment is commented-out scratch code
# for sweeping a fixed vocabulary size; it refers to data2_string_per_line,
# which no visible cell defines.
#model = LogisticRegression(**model_params, solver = 'liblinear', max_iter= 1000, random_state=0)
model = LogisticRegression(solver = 'liblinear', max_iter= 1000, random_state=0)

# for i in range(1000, 10001, 500):
#   vocab = [i[0] for i in Counter(tokens).most_common(i)]
#   train(data2_string_per_line, {'vocabulary':vocab})  # best 73.1

# black_list = ['xe2x80xa6','xe2x80x9c', 'xe2x80x9d', 'xefxb8x8f', 'xe2x80x99']

# for i in range(1000, 7501, 100):
#   vocab = [i[0] for i in Counter(tokens).most_common(i)]
#   print(i, train(data2_string_per_line, {'vocabulary':vocab}, model=model))  # best 73.1

0.635
