In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from nltk.tokenize import TreebankWordTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from  sklearn.metrics import classification_report
from gensim.models import Word2Vec
from nltk.corpus import stopwords

In [None]:
#Load the training data; the path is relative to the notebook's working directory
data = pd.read_csv("train.csv")
#Preview the first rows
data.head()

In [None]:
def dropCols(columns, data):
    """
    Returns a new data frame without the given columns

    input
    ------
    columns: list of column names to remove
    data: pandas dataframe (not modified)

    output
    -------
    pandas dataframe holding every column of data except those in columns
    """
    trimmed = data.drop(columns=columns)
    return trimmed

In [None]:
#Columns removed before modelling: replyToSID, latitude, longitude, screenName, id.1 and statusSource
to_drop = ["replyToSID", "latitude", "longitude", "screenName", "id.1", "statusSource"]
data = dropCols(to_drop, data)

In [None]:
#n = number of rows, d = number of columns left after the drop
n,d = data.shape
n,d

In [None]:
#Creates new fields: six +1/-1 flags derived from the tweet text plus the posting hour.
#Every column is assigned as a whole instead of element-wise through
#data[col][i] = 1: that chained indexing raises SettingWithCopyWarning and does
#not update the frame at all when pandas copy-on-write is enabled.

tweet_text = data["text"]

#Each flag is 1 when the condition holds for the tweet and -1 otherwise
data["has_link"] = np.where(tweet_text.str.contains("https", regex=False), 1, -1)
#is_capitalized uses str.isupper: every cased character of the tweet is upper case
data["is_capitalized"] = np.where(tweet_text.str.isupper(), 1, -1)
data["has_I"] = np.where(tweet_text.str.contains("I ", regex=False), 1, -1)
#"rooked" matches both "Crooked" and "crooked"
data["has_Crooked"] = np.where(tweet_text.str.contains("rooked", regex=False), 1, -1)
data["has_TY"] = np.where(tweet_text.str.contains("Thank you", regex=False), 1, -1)
data["has_Hillary"] = np.where(tweet_text.str.contains("Hillary", regex=False), 1, -1)

#"created" is split on whitespace and its second piece is taken as the time of day;
#the hour is whatever precedes the first ":" there, so both H:MM and H:MM:SS parse
hour = data["created"].str.split().str[1].str.split(":").str[0].astype(int).tolist()

data["hour"] = hour
    
    
    

Label 1 is Android (Trump) and label -1 is iPhone (not Trump).

In [None]:
#Splits the data into train/val with ratio 0.8:0.2 (fixed seed so the split is repeatable)

label_col = data["label"]
feature_cols = data.drop(columns="label")
xTr, xVal, yTr, yVal = train_test_split(feature_cols, label_col, test_size=0.2, random_state=500)

#Keep the tweet ids aside, then remove them from the feature frames
id_tr, id_val = xTr["id"], xVal["id"]
xTr = xTr.drop(columns="id")
xVal = xVal.drop(columns="id")

In [None]:
#Source: Sena's written function from CS 4300

#Upper bound on the TF-IDF vocabulary size
features = 5000

def build_vectorizer(max_features, stop_words, max_df=0.8, min_df=10, norm='l2'):
    """Builds an unfitted TfidfVectorizer from the given settings

    Every argument is forwarded unchanged to the TfidfVectorizer constructor.

    Params: {max_features: Integer,
             stop_words: String,
             max_df: Float (default 0.8),
             min_df: Integer or Float (default 10),
             norm: String (default 'l2')}
    Returns: TfidfVectorizer
    """
    settings = dict(
        max_features=max_features,
        stop_words=stop_words,
        max_df=max_df,
        min_df=min_df,
        norm=norm,
    )
    return TfidfVectorizer(**settings)
    
#Fit the vectorizer on the training tweets only, then reuse the fitted vectorizer on the validation tweets.
#train_doc is converted to a dense array with .toarray(); val_doc is left as returned by transform.
tfidf_vec = build_vectorizer(features, "english")
train_doc = tfidf_vec.fit_transform([d for d in xTr.text]).toarray()
val_doc = tfidf_vec.transform([d for d in xVal.text])

In [None]:
#Baseline model uses a Multinomial Naive Bayes classifier on the TF-IDF features

#Fit on the training split, predict the validation split and print the classification report
clf = MultinomialNB().fit(train_doc, yTr)
predicted = clf.predict(val_doc)
print(classification_report(yVal,predicted))

### Load the test data

In [None]:
#Same drop list as for the training data, except that "statusSource" is not listed here.
#NOTE(review): presumably test.csv has no statusSource column - confirm.
to_drop2 = ["replyToSID", "latitude", "longitude", "screenName", "id.1"]

test_data = pd.read_csv("test.csv")
test_data = dropCols(to_drop2, test_data)
#Keep the ids aside for the submission files, then remove the id column from the features
test_data_id = test_data["id"]
test_data = test_data.loc[:, test_data.columns != 'id']

In [None]:
#Creates the same engineered fields for the test set as for the training set:
#six +1/-1 flags derived from the tweet text plus the posting hour.
#All flags are computed from test_data's own text (has_TY and has_Hillary must
#not be read from the training frame), and every column is assigned as a whole
#because chained test_data[col][i] = 1 raises SettingWithCopyWarning and does
#not update the frame when pandas copy-on-write is enabled.

test_text = test_data["text"]

#Each flag is 1 when the condition holds for the tweet and -1 otherwise
test_data["has_link"] = np.where(test_text.str.contains("https", regex=False), 1, -1)
#is_capitalized uses str.isupper: every cased character of the tweet is upper case
test_data["is_capitalized"] = np.where(test_text.str.isupper(), 1, -1)
test_data["has_I"] = np.where(test_text.str.contains("I ", regex=False), 1, -1)
#"rooked" matches both "Crooked" and "crooked"
test_data["has_Crooked"] = np.where(test_text.str.contains("rooked", regex=False), 1, -1)
test_data["has_TY"] = np.where(test_text.str.contains("Thank you", regex=False), 1, -1)
test_data["has_Hillary"] = np.where(test_text.str.contains("Hillary", regex=False), 1, -1)

#"created" is split on whitespace and its second piece is taken as the time of day;
#the hour is whatever precedes the first ":" there, so both H:MM and H:MM:SS parse
t_hour = test_data["created"].str.split().str[1].str.split(":").str[0].astype(int).tolist()

test_data["hour"] = t_hour
        
        

In [None]:
#Vectorise the test tweets with the already fitted vectorizer, then predict with the Naive Bayes baseline
test_doc = tfidf_vec.transform([d for d in test_data.text])
prediction_test = clf.predict(test_doc)

In [None]:
#Writes the Naive Bayes test predictions to csv.
#index=False keeps the pandas row index out of the file, so it holds only the
#ID and Label columns and needs no manual editing before submitting to Kaggle

cols = {"ID":test_data_id, "Label": prediction_test}
df = pd.DataFrame(cols)
df.to_csv("MultiNB.csv", index=False)

## Word2Vec representation of the textual data

In [None]:
tokenizer = TreebankWordTokenizer()

def tokenizeText(text):
    
    """
    Returns tokenized text with English stop words removed
    
    input
    ------
    text: list of tweets (any iterable of strings)
    
    output
    -------
    tokenized_text: list with one list of tokens per tweet, in input order
    
    """
    
    #Load the stop word list once per call instead of once per token, and keep
    #it in a set so each membership test is a constant-time lookup
    stop_words = set(stopwords.words('english'))
    
    tokenized_text = []
    
    for row in text:
        words = tokenizer.tokenize(row)
        
        #remove stop words before appending (the comparison is case-sensitive)
        tokenized_text.append([word for word in words if word not in stop_words])

    return tokenized_text

#Tokenise the training and validation tweets.
#NOTE: tokenized_text_test holds the validation split (xVal), not the contents of test.csv
tokenized_text_train = tokenizeText(xTr.text)
tokenized_text_test = tokenizeText(xVal.text)

In [None]:
#Train the w2v model on the tokenized training tweets only
#NOTE(review): `size` is the gensim 3.x keyword (gensim >= 4.0 names it `vector_size`) - confirm the installed version
model = Word2Vec(tokenized_text_train, size=300, window=5, min_count=1, workers=4)

In [None]:
def getAvgVec(model, tokenized_text):
    
    """
    Returns one averaged word vector per sentence
    
    input
    ------
    model: trained word2vec model (its word vectors are read through model.wv)
    tokenized_text: list of tokenized text, one list of tokens per sentence
    
    output
    ------
    avg_vec_train: numpy array of shape (number of sentences, vector size) where
                   row i is the sum of the vectors of the known words of
                   sentence i divided by its total number of tokens; a sentence
                   without tokens gets an all-zero row
    """
    
    #Membership tests and lookups both go through the keyed vectors; testing
    #`item in model` on the model object itself is not supported by newer gensim
    word_vectors = model.wv
    
    #Take the dimensionality from the model instead of hard-coding 300
    dim = word_vectors.vector_size
    
    avg_vec_train = np.zeros((len(tokenized_text), dim))

    for i, tokens in enumerate(tokenized_text):

        #Nothing to average: keep the zero row rather than dividing by zero,
        #which would fill the row with NaN
        if len(tokens) == 0:
            continue

        vec = np.zeros(dim)
        for item in tokens:
            if item in word_vectors:
                vec += word_vectors[item]

        #Words missing from the vocabulary add nothing to the sum but still
        #count in the divisor
        avg_vec_train[i, :] = vec / len(tokens)
    
    return avg_vec_train

In [None]:
#Averaged word vectors for the training and validation tweets (tokenized_text_test is the validation split)
vecs_train = getAvgVec(model, tokenized_text_train)
vecs_val = getAvgVec(model, tokenized_text_test)

In [None]:
#Convert True/False to 1/0 in every boolean column
#The dtype is checked on xTr only; the same columns are converted in xVal
bool_cols = [name for name in xTr.columns if xTr[name].dtype == "bool"]
for name in bool_cols:
    xTr[name] *= 1
    xVal[name] *= 1

In [None]:
def concatenateColsAndArray(data, nparray, colsToRemove=[]):
    
    """
    Returns nparray with the remaining dataframe columns appended on the right
    
    input
    -------
    data: dataframe with one row per row of nparray
    nparray: 2-D array of vecs for sentences
    colsToRemove: list of columns to exclude from dataframe
    
    output
    -------
    newdata: numpy array whose first columns are nparray and whose last
             columns are the kept columns of data. When colsToRemove is not
             empty the kept columns are selected via columns.difference, which
             returns the names in sorted order; otherwise the dataframe's own
             column order is used.
    """
    
    if len(colsToRemove) == 0:
        kept = data
    else:
        kept = data[data.columns.difference(colsToRemove)]
    
    return np.concatenate((nparray, kept.values), axis=1)

In [None]:
#concatenate other columns with array
#The created, replyToSN, replyToUID and text columns are left out of the feature matrix
cols_to_remove = ['created', 'replyToSN', "replyToUID", "text"]
rf_input_train = concatenateColsAndArray(xTr, vecs_train,cols_to_remove)

In [None]:
def RFClassifier(ntrees, nfeatures, max_depth=30, random_state=None):
    """
    Builds an unfitted Random Forest Classifier
    
    input
    ------
    ntrees: number of trees in the forest
    nfeatures: not used; kept so that existing calls keep working
    max_depth: maximum depth of each tree (default 30, the value that used to
               be hard-coded)
    random_state: seed forwarded to the classifier for repeatable forests;
                  the default None leaves the forest unseeded as before
    
    output
    ------
    model: RandomForestClassifier that has not been fitted yet
    """
    model = RandomForestClassifier(n_estimators=ntrees,
                                   max_depth=max_depth,
                                   random_state=random_state)
    return model

In [None]:
#Build the forest and fit it on the training matrix
#nfeatures is passed along but RFClassifier does not use it
ntrees = 5000
nfeatures = 300
RFC = RFClassifier(ntrees, nfeatures)
forest = RFC.fit(rf_input_train, yTr)

In [None]:
#Build the validation matrix the same way as the training one: averaged w2v vectors plus the kept columns
rf_input_val = concatenateColsAndArray(xVal, vecs_val,cols_to_remove)

#Predict the validation split and print the classification report
res = forest.predict(rf_input_val)
print(classification_report(yVal,res))

In [None]:
#Run the test tweets through the same pipeline: tokenize, average the w2v vectors, append the kept columns, predict
#NOTE(review): assumes test_data ends up with the same kept columns as xTr - confirm
tokenized_test = tokenizeText(test_data.text)
vecs_test = getAvgVec(model, tokenized_test)
rf_input_test = concatenateColsAndArray(test_data, vecs_test ,cols_to_remove)
test_res = forest.predict(rf_input_test)

In [None]:
#Writes the Random Forest test predictions to csv.
#index=False keeps the pandas row index out of the file, so it holds only the
#ID and Label columns

cols = {"ID":test_data_id, "Label": test_res}
df = pd.DataFrame(cols)
df.to_csv("RFtext.csv", index=False)