In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [2]:
### Text Pre-processing
import re, string
punctuations = string.punctuation + "…"
import nltk
from nltk.tokenize import word_tokenize # , sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Create Nicknames Dictionary from All-star/Well-noted Player Lists

In [5]:
namelist = pd.read_csv("namelist_new.csv")
nicknames = pd.read_csv("nicknames_new.csv")

In [6]:
namelist.columns = ["Names", "Share"]
print(namelist.shape)
namelist.head(2)

(32, 2)


Unnamed: 0,Names,Share
0,James Harden,0.373011
1,LeBron James,0.365153


In [7]:
print(nicknames.shape)
nicknames.head(2)

(32, 7)


Unnamed: 0,Player Name,Nickname 1,Nickname 2,Nickname 3,Nickname 4,Nickname 5,Nickname 6
0,James Harden,JamesHarden,JHarden13,Harden,Beard,,
1,LeBron James,LeBronJames,KingJames,LeBron,LBJ,King James,chosen one


In [10]:
for column in nicknames.columns[1:]:
    nicknames[column] = nicknames[column].apply(lambda x: "".join(x.split()).lower() )

In [11]:
nicknames = nicknames.fillna("")

In [14]:
nicknames_dict = {}
for i in range(len(namelist)):
    names = []
    names.append(namelist.loc[i]["Names"].lower().split()[-1])
    if namelist.loc[i]["Names"] in nicknames["Player Name"].values:
        row = nicknames[nicknames["Player Name"] == namelist.loc[i]["Names"]]
        for j in range(1, len(row.columns)):
            if (row.iloc[0,j] != "") and (row.iloc[0,j] != "nan"):
                names.append(row.iloc[0,j])
    nicknames_dict.update({namelist.loc[i]["Names"]: names})

In [15]:
print(len(namelist))
print(len(nicknames_dict.keys()))

32
32


In [16]:
nicknames_dict

{'James Harden': ['harden', 'jamesharden', 'jharden13', 'harden', 'beard'],
 'LeBron James': ['james',
  'lebronjames',
  'kingjames',
  'lebron',
  'lbj',
  'kingjames',
  'chosenone'],
 'Giannis Antetokounmpo': ['antetokounmpo',
  'giannisantetokounmpo',
  'giannis_an34',
  'giannis',
  'greekfreak',
  'greekfreak'],
 'Anthony Davis': ['davis',
  'anthonydavis',
  'antdavis23',
  'ad',
  'thebrow',
  'davis'],
 'Luka Doncic': ['doncic', 'lukadoncic', 'luka7doncic', 'luka', 'doncic'],
 'Trae Young': ['young', 'traeyoung', 'thetraeyoung', 'trae'],
 'Nikola Jokic': ['jokic',
  'nikolajokic',
  'jokic',
  'bighoney',
  'thejoker',
  'bighoney'],
 'Russell Westbrook': ['westbrook',
  'russellwestbrook',
  'russwest44',
  'russ',
  'westbrook',
  'brodie',
  'beastbrook'],
 'Kawhi Leonard': ['leonard', 'kawhileonard', 'kawhileonard', 'kawhi', 'claw'],
 'Chris Paul': ['paul', 'chrispaul', 'cp3', 'cp3'],
 'Devin Booker': ['booker', 'devinbooker', 'devinbook', 'booker', 'devinbook'],
 'Khris 

In [17]:
players = list(nicknames_dict.keys())

# Text Pre-processing

In [120]:
import spacy
import en_core_web_lg

nlp = en_core_web_lg.load()

In [121]:
player_handles  = ['@jharden13', '@kingjames', '@giannis_an34', '@antdavis23', '@luka7doncic', '@thetraeyoung', 'jokic', '@russwest44', '@kawhileonard', '@cp3', '@devinbook', '@khris22m', '@bam1of1', '@jaytatum0', '@kembawalker', '@jimmybutler', '@bensimmons25', '@joelembiid', '@dsabonis11', '@b_ingram13', '@pskills43', '@rudygobert27', '@spidadmitchell', '@klow7', '@dame_lillard', '@demar_derozan', '@isaiahthomas', '@johnwall', '@aldridge_12', '@yg_trece', '@stephencurry30', '@vicoladipo']

def spacy_cleaner(text):
    text = text.lower()
    #nlp tokenize it 
    parsed = nlp(text)
    final_tokens = []
    for t in parsed:
        #if it is a player handle, remove the @ mention
        if str(t) in player_handles: 
            final_tokens.append(str(t)[1:])
        #if number, stopwords, space, url, @ mention of other users, do not keep it. Keep stop words (or t.is_stop)
        elif t.is_punct or t.is_space or t.like_num or t.like_url:
            pass
        #if words or likely to be words
        else:
            #if no lemmatization available, do nothing
            if t.lemma_ == '-PRON-':
                final_tokens.append(str(t))
            else:
                #Otherwise remove any none letter in the string, and replace with the root word
                sc_removed = re.sub("[^a-zA-Z]", '', str(t.lemma_))
                #if the rootword is a word, keep it. if not, drop it. Removes none words, links
                if len(sc_removed) > 1:
                    final_tokens.append(sc_removed)

    joined = ' '.join(final_tokens)
    #replace spelling correction e.g. replace happpppyyyyy with happyy
    spell_corrected = re.sub(r'(.)\1+', r'\1\1', joined)
    return spell_corrected

In [None]:
data1 = pd.read_csv("All_Tweets_P1_Carmen.csv")
data2 = pd.read_csv("All_Tweets_P2_Vincent.csv")
data3 = pd.read_csv("All_Tweets_P3_Jing.csv")
data = pd.concat([data1, data2, data3])

In [None]:
# Brief description of dataframe
print(data.columns)
print(data.dtypes.value_counts())
print("'%s' data shape is " % (data.loc[0,"username"]), data.shape)

In [None]:
df = data[["user_id", "username", "date", "time", "tweet"]]
print(df.shape)
df.head(2)

In [None]:
df["text"] = df["tweet"].dropna().apply(lambda txt: spacy_cleaner(txt))

In [None]:
df.to_csv("All_Tweets.csv")

# Loading Processed Data
### (for work continued)

In [3]:
df = pd.read_csv("All_Tweets.csv")
df.head(2)

Unnamed: 0.1,Unnamed: 0,user_id,username,date,time,tweet,text,textblob,nltk_tweet,sent140,...,Donovan Mitchell,Kyle Lowry,Damian Lillard,DeMar DeRozan,Isaiah Thomas,John Wall,LaMarcus Aldridge,Paul George,Stephen Curry,Victor Oladipo
0,0,35993422,adaimiel,2020-03-08,03:43:38,😳😳,-,-1.0,-1,0.216302,...,0,0,0,0,0,0,0,0,0,0
1,1,35993422,adaimiel,2020-03-08,01:19:22,"Con 43, mirando así al defensor: https://twitt...",con mirando as al defensor,-1.0,1,-0.087027,...,0,0,0,0,0,0,0,0,0,0


In [22]:
df.shape

(610578, 43)

11 twitter-realted columns with 32 player-related binary columns

# TextBlob Sentiment Score

In [42]:
from textblob import TextBlob
df["textblob"] =  df["text"].apply(lambda txt: TextBlob(txt).sentiment.polarity)

In [46]:
sum(df["textblob"])

46709.574464646794

# NLTK Twitter Data Sentiment Score

In [27]:
### Positive/Negative Training and Testing using nltk twitter sample dataset
from nltk.corpus import twitter_samples

In [28]:
positive_tweets = twitter_samples.strings("positive_tweets.json")
negative_tweets = twitter_samples.strings("negative_tweets.json")

In [122]:
positive_tweets_nice = []
negative_tweets_nice = []
for text in positive_tweets:
    positive_tweets_nice.append(word_tokenize(spacy_cleaner(text)))
for text in negative_tweets:
    negative_tweets_nice.append(word_tokenize(spacy_cleaner(text)))

In [123]:
positive_tweets_nice[0]

['followfriday',
 'franceinte',
 'pkuchly',
 'milipolparis',
 'for',
 'be',
 'top',
 'engage',
 'member',
 'in',
 'my',
 'community',
 'this',
 'week']

In [124]:
def get_tweets_for_model(tokens_list):
    for tweet_tokens in tokens_list:
        yield dict([token, True] for token in tweet_tokens)

In [125]:
positive_tokens_for_model = get_tweets_for_model(positive_tweets_nice)
negative_tokens_for_model = get_tweets_for_model(negative_tweets_nice)

In [126]:
positive_tweets_dataset = [(tweet_dict, "Positive")
                           for tweet_dict in positive_tokens_for_model]
negative_tweets_dataset = [(tweet_dict, "Negative")
                           for tweet_dict in negative_tokens_for_model]

In [127]:
import random
dataset = positive_tweets_dataset + negative_tweets_dataset
random.shuffle(dataset)

In [128]:
train_data = dataset[:8000]
test_data = dataset[2000:]

In [129]:
from nltk import classify
from nltk import NaiveBayesClassifier

In [130]:
classifier = NaiveBayesClassifier.train(train_data)
print("Accuracy is:", classify.accuracy(classifier, test_data))
print(classifier.show_most_informative_features(5))

Accuracy is: 0.895125
Most Informative Features
              unfollower = True           Positi : Negati =     31.3 : 1.0
                     sad = True           Negati : Positi =     27.1 : 1.0
                      ff = True           Positi : Negati =     25.2 : 1.0
                     bam = True           Positi : Negati =     25.2 : 1.0
                follower = True           Positi : Negati =     24.8 : 1.0
None


In [131]:
df["nltk_tweet"] = df["text"].apply(lambda txt: 1 if classifier.classify(dict([token, True] for token in (word_tokenize(txt)) ) ) == "Positive" else -1)

# Sentiment140 Sentiment Score

In [47]:
### Sentiment140 Implementation
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

import gensim

from collections import Counter
import pickle
import itertools

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [48]:
data_columns = ["target", "ids", "date", "flag", "user", "text"]
data_encoding = "ISO-8859-1"

In [None]:
sent_data = pd.read_csv("sentiment140.csv", encoding=data_encoding, names=data_columns)

In [None]:
print("Dataset shape:", sent_data.shape)
sent_data.head(2)

In [49]:
train_size = 0.8
test_size = 0.2

text_cleaning_re = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

In [50]:
decode_map = {0: "NEGATIVE", 2: "NEUTRAL", 4: "POSITIVE"}

def decode_sentiment(label):
    return decode_map[int(label)]

In [None]:
sent_data.target = sent_data.target.apply(lambda label: decode_sentiment(label))

In [None]:
target_count = Counter(sent_data.target)

print(target_count)

In [None]:
def preprocess(text, stem=False):
    text = re.sub(text_cleaning_re, " ", str(text).lower()).strip()
    return lemmatizer(remove_stopwords(text))

In [None]:
sent_data.text = sent_data.text.apply(lambda txt: spacy_cleaner(txt))

In [None]:
sent_data.to_csv("sentiment140_processed.csv", index=False)

In [51]:
sent_data = pd.read_csv("sentiment140_processed.csv")

In [52]:
data_columns = ["target", "ids", "date", "flag", "user", "text"]
data_encoding = "ISO-8859-1"
train_size = 0.8
test_size = 0.2
text_cleaning_re = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"
decode_map = {0: "NEGATIVE", 2: "NEUTRAL", 4: "POSITIVE"}

target_count = Counter(sent_data.target)

In [53]:
print(sent_data.shape)
sent_data.head(2)

(1600000, 6)


Unnamed: 0,target,ids,date,flag,user,text
0,NEGATIVE,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,awww bummer shoulda get david carr third day
1,NEGATIVE,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,upset update facebook texting might cry result...


In [54]:
sent_data["text"] = sent_data["text"].apply(lambda txt: str(txt))

In [55]:
sent_train, sent_test = train_test_split(sent_data, test_size=test_size)

In [56]:
### implementing Word2Vec

documents = [txt.split() for txt in sent_train.text]

In [57]:
w2v_size = 300
w2v_window = 7
w2v_epoch = 32
w2v_min_count = 10

w2v_model = gensim.models.word2vec.Word2Vec(size=w2v_size,
                                            window=w2v_window,
                                            min_count=w2v_min_count,
                                            workers=4)

In [58]:
w2v_model.build_vocab(documents)

In [59]:
words = w2v_model.wv.vocab.keys()
vocab_size = len(words)
print(vocab_size)

25488


In [60]:
w2v_model.train(documents, total_examples=len(documents), epochs=w2v_epoch)

(256959754, 296706272)

In [61]:
w2v_model.most_similar("love")

  """Entry point for launching an IPython kernel.


[('amaze', 0.5625813007354736),
 ('adore', 0.5382034778594971),
 ('luv', 0.5316988229751587),
 ('awesome', 0.4931204319000244),
 ('loove', 0.48716920614242554),
 ('looove', 0.4643394351005554),
 ('loooove', 0.4167487323284149),
 ('lovee', 0.4045538604259491),
 ('reminds', 0.38943615555763245),
 ('miss', 0.3852258622646332)]

In [62]:
# tokenizing X (text)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(sent_train.text)

vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

273142


In [63]:
# padding X (text)

sequence_length = 300

X_train = pad_sequences(tokenizer.texts_to_sequences(sent_train.text),
                       maxlen=sequence_length)
X_test = pad_sequences(tokenizer.texts_to_sequences(sent_test.text),
                      maxlen=sequence_length)

In [64]:
sent_train.head(2)

Unnamed: 0,target,ids,date,flag,user,text
745553,NEGATIVE,2282454677,Mon Jun 22 11:23:24 PDT 2009,NO_QUERY,crosswalkkarma,come date unfortunately
626085,NEGATIVE,2230470335,Thu Jun 18 17:09:33 PDT 2009,NO_QUERY,flyingjenny,sorry hear


In [65]:
# encoding y (sentiment label)

POSITIVE = "POSITIVE"
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"

labels = sent_train.target.unique().tolist()
labels.append(NEUTRAL)
labels

['NEGATIVE', 'POSITIVE', 'NEUTRAL']

In [66]:
encoder = LabelEncoder()
encoder.fit(sent_train.target.tolist())

LabelEncoder()

In [67]:
y_train = encoder.transform(sent_train.target.tolist())
y_test = encoder.transform(sent_test.target.tolist())

In [68]:
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

In [69]:
print("X_train", X_train.shape)
print("y_train", y_train.shape, "\n")
print("X_test", X_test.shape)
print("y_test", y_test.shape)

X_train (1280000, 300)
y_train (1280000, 1) 

X_test (320000, 300)
y_test (320000, 1)


In [103]:
# building neural network

embedding_matrix = np.zeros((vocab_size, w2v_size)) # (273247, 300)

# tokenizer.word_index.items() is a dictionary
#   of many "word": index_number made into a list
for word, i in tokenizer.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]
print(embedding_matrix.shape)

(273142, 300)


In [104]:
model = Sequential()
model.add(Embedding(vocab_size, w2v_size, weights=[embedding_matrix],
                   input_length=sequence_length, trainable=False))
model.add(Dropout(0.5))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation="sigmoid"))

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 300, 300)          81942600  
_________________________________________________________________
dropout_1 (Dropout)          (None, 300, 300)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               160400    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 82,103,101
Trainable params: 160,501
Non-trainable params: 81,942,600
_________________________________________________________________


In [105]:
model.compile(loss="binary_crossentropy",
              optimizer="adam",
              metrics=["accuracy"])

In [106]:
callbacks = [ ReduceLROnPlateau(monitor="val_loss", patience=5, cooldown=0),
              EarlyStopping(monitor="val_acc", min_delta=1e-4, patience=5)]

In [None]:
epochs = 3
batch_size = 1024

history = model.fit(X_train, y_train, batch_size=batch_size,epochs=epochs,
                    validation_split=0.1, callbacks=callbacks)

In [None]:
from keras.models import load_model
model.save('sentiment140_model.h5')

In [None]:
# evaluating the model

score = model.evaluate(X_test, y_test, batch_size=batch_size)
print("accuracy:", score[1])
print("loss:", score[0])

### (for work continued)

In [194]:
from keras.models import load_model
model = load_model('sentiment140_model.h5')

In [108]:
def decode_sentiment(score, include_neutral=True, sentiment_thresholds=[0.3, 0.7]):
    POSITIVE = "POSITIVE"
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    
    if include_neutral:
        label = NEUTRAL
        if score <= sentiment_thresholds[0]:
            label = NEGATIVE
        elif score >= sentiment_thresholds[1]:
            label = POSITIVE
        return label
    else:
        return NEGATIVE if score < 0.5 else POSITIVE

In [192]:
def predict(text, include_neutral=True):
    sequence_length = 300
    
    # padding the X test (text) to have the same sequence length
    XX_test = pad_sequences(tokenizer.texts_to_sequences([text]),
                            maxlen=sequence_length)
    score = model.predict([XX_test])[0]
    label = decode_sentiment(score, include_neutral=include_neutral)
    
    return {"label": label, "score": float(score)}

In [132]:
df["sent140"] = df["text"].apply(lambda txt: 2*(predict_sent(txt)["score"]) - 1)

# Emoticon Data Sentiment Score

In [133]:
df_positive = pd.read_csv("positive.csv", encoding='ISO-8859-1')
df_negative = pd.read_csv("negative.csv", encoding='ISO-8859-1')
df_emoticon = pd.concat([df_positive,df_negative],ignore_index = True)
df_emoticon.head()

Unnamed: 0,ï»¿id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,...,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest
0,1.24287e+18,1.24287e+18,1585160000000.0,26/3/2020,1:50:54,China Standard Time,25539920.0,sarahramsingh,Sarah Ramsingh,,...,,,,,,"[{'user_id': '25539918', 'username': 'SarahRam...",,,,
1,1.24234e+18,1.24234e+18,1585030000000.0,24/3/2020,14:26:45,China Standard Time,3343737000.0,trinex_yt,FsK TrinexTV,,...,,,,,,"[{'user_id': '3343736834', 'username': 'TRINEX...",,,,
2,1.24212e+18,1.24192e+18,1584980000000.0,24/3/2020,0:06:24,China Standard Time,9.46581e+17,westin_hudnall,Westin HudnAll,,...,,,,,,"[{'user_id': '946581212549844994', 'username':...",,,,
3,1.24116e+18,1.24116e+18,1584750000000.0,21/3/2020,8:36:24,China Standard Time,1.04947e+18,honestlyagoat,Dyphrant,,...,,,,,,"[{'user_id': '1049465403406204928', 'username'...",,,,
4,1.24091e+18,1.24067e+18,1584690000000.0,20/3/2020,16:02:41,China Standard Time,2384059000.0,pgtrym91,#PgTrym91 Norwegian ???????? on Xbox,,...,,,,,,"[{'user_id': '2384059068', 'username': 'PgTrym...",,,,


In [134]:
df_emoticon = df_emoticon[["user_id", "username", "date", "time", "tweet"]]

In [135]:
positive_emoticon = [':)',':-)',': )',':D','=)']
negative_emoticon = [':(',':-(',': (']
list_emoticon = positive_emoticon + negative_emoticon
def label_emoticon(row,r=0):
    for e in list_emoticon:
        if e in row:
            if e in positive_emoticon:
                r+=1
            if e in negative_emoticon:
                r-=1
            continue
    return r

In [136]:
series = []
for i in df_emoticon["tweet"]:
    series.append(label_emoticon(i))

In [137]:
from collections import Counter
print('Values of sentiments: ',Counter(series).keys()) # equals to list(set(words))
print('Counts of tweets with corresponding sentiments: ', Counter(series).values())

Values of sentiments:  dict_keys([1, 0, 2, -1, -2])
Counts of tweets with corresponding sentiments:  dict_values([2890, 130, 6, 2968, 6])


In [138]:
df_emoticon['emoticon'] = pd.Series(series)

In [139]:
df_emoticon["cleaned_tweet"] = df_emoticon["tweet"].apply(lambda txt: spacy_cleaner(txt))

In [140]:
emoticon_vect = TfidfVectorizer()
df_emoticon_X_train = emoticon_vect.fit_transform(df_emoticon["cleaned_tweet"])
df_emoticon_y_train = df_emoticon["emoticon"]

In [141]:
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier

In [142]:
MultinomialNB_param = {
    'alpha': [0.1, 1, 5, 10], 
    'fit_prior': [True, False],
}

In [143]:
LogisticRegression_param = {
    "C": [3.8, 3.9, 4, 4.1, 4.2],
    "max_iter": [42, 43, 44, 250],
    "penalty": ["l2"]
}

In [144]:
DecisionTreeClassifier_param = {
    "criterion": ["gini", "entropy"],
    "max_depth": [1, 5, 10, 15], 
}

In [145]:
RandomForestClassifier_param = {
    "bootstrap": [True],
    "n_estimators": [200, 400],
    "max_depth": [10, 30, 60],
    "max_features": ["auto"],
    "min_samples_leaf": [2, 3],
    "min_samples_split": [2, 3],
}

In [146]:
XGBClassifier_param = {
    "n_estimators": [200, 400, 800],
    "objective": ["binary:logistic"],
    "min_child_weight": [1],
    "gamma": [0.5, 0.8, 1.0],
    "subsample": [0.5, 0.8, 1.0],
    "colsample_bytree": [0.8, 1.0],
    "max_depth": [1, 3, 5],
}

In [147]:
models = [
    MultinomialNB,
    LogisticRegression,
    DecisionTreeClassifier,
    RandomForestClassifier,
    XGBClassifier
]

param = {
    "MultinomialNB": MultinomialNB_param,
    "LogisticRegression": LogisticRegression_param,
    "DecisionTreeClassifier": DecisionTreeClassifier_param,
    "RandomForestClassifier": RandomForestClassifier_param,
    "XGBClassifier": XGBClassifier_param
}

In [148]:
model_result = pd.DataFrame(
    columns = [
        "model",
        "parameter",
        "cross_val_score",
        "train_accuracy",
    ]
)
model_index = 0

In [150]:
for model in models:
    clf = model()

    # param_grid = param[model.__name__]
    param_grid = param[str(model.__name__)]

    emoticon_grid = GridSearchCV(clf, param_grid, cv=5, scoring="accuracy").fit(df_emoticon_X_train, df_emoticon_y_train)

    print("------------------\nModel: %s" % str(model.__name__))
    print("Best cross validation score is:", emoticon_grid.best_score_)
    print(emoticon_grid.best_params_)
    
    y_pred = emoticon_grid.predict(df_emoticon_X_train)
    
    model_result.loc[model_index] = [
        model.__name__,
        emoticon_grid.best_params_,
        emoticon_grid.best_score_,
        accuracy_score(df_emoticon_y_train, y_pred)
    ]

    model_index += 1

------------------
Model: MultinomialNB
Best cross validation score is: 0.7296666666666667
{'alpha': 5, 'fit_prior': False}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


------------------
Model: LogisticRegression
Best cross validation score is: 0.7421666666666666
{'C': 3.9, 'max_iter': 44, 'penalty': 'l2'}
------------------
Model: DecisionTreeClassifier
Best cross validation score is: 0.6226666666666667
{'criterion': 'gini', 'max_depth': 10}
------------------
Model: RandomForestClassifier
Best cross validation score is: 0.7230000000000001
{'bootstrap': True, 'max_depth': 60, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 3, 'n_estimators': 400}


KeyboardInterrupt: 

In [152]:
model_result.to_csv("emoticon_model_result.csv", index=False)

In [153]:
best_clf = LogisticRegression(C=3.9, max_iter=44, penalty="l2")
best_clf.fit(df_emoticon_X_train, df_emoticon_y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=3.9, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=44,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [158]:
df_emoticon_X_test = emoticon_vect.transform(df["text"])
df["emoticon"] = best_clf.predict_proba(df_emoticon_X_test)[:,1]
df["emoticon"] = df["emoticon"].apply(lambda x: 2*x - 1)

# Creating Player Columns

In [None]:
original_columns = ["user_id", "username", "date", "time", "tweet", "text", "textblob", "nltk_tweet", "sent140", "emoticon"]
df = df[original_columns]

In [None]:
for player in players:
   df[player] = df["text"].apply(lambda txt: 1 if any(nickname in txt for nickname in nicknames_dict[player]) else 0)

# Saving Current Work into .csv File

In [159]:
df.to_csv("all_tweets_with_added_columns.csv")

# Testing on Sample Tweets (Labelled Manually)

In [162]:
sample_df = pd.read_csv("tweet_samples_manual.csv")
print(sample_df.shape)
sample_df.head(2)

(104, 46)


Unnamed: 0.2,Unnamed: 0,year,Unnamed: 0.1,user_id,username,date,time,tweet,text,manual,...,Donovan Mitchell,Kyle Lowry,Damian Lillard,DeMar DeRozan,Isaiah Thomas,John Wall,LaMarcus Aldridge,Paul George,Stephen Curry,Victor Oladipo
0,109090,2020,109090,21735948,detnewsrodbeard,1/3/2020,10:08:10,"For what it's worth, #Pistons Dwane Casey says...",for what it be worth pistons dwane casey say b...,1,...,0,0,0,0,0,0,0,0,0,0
1,597861,2020,597861,209716158,pompeyonsixers,2/23/2020,19:25:32,"Sixers lose to Bucks, 119-98, as Ben Simmons l...",sixer lose to buck as ben simmons leave early ...,-1,...,0,0,0,0,0,0,0,0,0,0


In [163]:
sample_df["textblob"] = sample_df["text"].apply(lambda txt: TextBlob(txt).sentiment.polarity)

In [164]:
sample_df["nltk_tweet"] = sample_df["text"].apply(lambda txt: 1 if classifier.classify(dict([token, True] for token in (word_tokenize(txt)) ) ) == "Positive" else -1)

In [197]:
sample_df["sent140"] = sample_df["text"].apply(lambda txt: 2*(predict(str(txt))["score"]) - 1)

In [198]:
sample_df_X_test = emoticon_vect.transform(sample_df["text"])
sample_df["emoticon"] = best_clf.predict_proba(sample_df_X_test)[:,1]
sample_df["emoticon"] = sample_df["emoticon"].apply(lambda x: 2*x - 1)

In [167]:
sample_df.to_csv("tweet_samples_manual_tested.csv")

In [200]:
manual_columns = ["manual", "textblob", "nltk_tweet", "sent140", "emoticon"]
sample_df2 = sample_df[manual_columns]
sample_df2.head(2)

Unnamed: 0,manual,textblob,nltk_tweet,sent140,emoticon
0,1,-0.05,1,0.544746,-0.105417
1,-1,0.1,1,0.54733,0.093106


In [203]:
textblob_mse = sum((sample_df2["manual"] - sample_df2["textblob"])**2)
nltk_mse = sum((sample_df2["manual"] - sample_df2["nltk_tweet"])**2)
sent_mse = sum((sample_df2["manual"] - sample_df2["sent140"])**2)
emot_mse = sum((sample_df2["manual"] - sample_df2["emoticon"])**2)

textblob_mae = sum(abs(sample_df2["manual"] - sample_df2["textblob"]))
nltk_mae = sum(abs(sample_df2["manual"] - sample_df2["nltk_tweet"]))
sent_mae = sum(abs(sample_df2["manual"] - sample_df2["sent140"]))
emot_mae = sum(abs(sample_df2["manual"] - sample_df2["emoticon"]))

In [204]:
print(textblob_mse, nltk_mse, sent_mse, emot_mse)
print(textblob_mae, nltk_mae, sent_mae, emot_mae)

68.91568035564572 135 141.82335193499506 147.6100668023635
76.2532406720264 75 103.7680677017197 110.10366860415026
