In [28]:
#Libraries used for data importing and cleaning
import pandas as pd
import re
import string

#Random forest library
from sklearn.ensemble import RandomForestClassifier

#To score the results
from sklearn import model_selection

In [2]:
#Import the data
tweet = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [3]:
#Next I am going to concat the two datasets so that the data cleaning I do on them are equal
df=pd.concat([tweet,test])

In [5]:
#I am going to drop the id, keyword, and location columns now
df = df.drop(['id','keyword','location'], axis=1)

In [7]:
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)

def remove_punct(text):
    table=str.maketrans('','',string.punctuation)
    return text.translate(table)

def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [10]:
df['text']=df['text'].apply(lambda x : remove_URL(x))
df['text']=df['text'].apply(lambda x : remove_html(x))
df['text']=df['text'].apply(lambda x : remove_punct(x))


In [11]:
df_noemoji = df.copy()
df_noemoji['text']=df_noemoji['text'].apply(lambda x: remove_emoji(x))

In [12]:
#The next thing that needs to be done is to tokenize the text, I will make a new DF for this
df_token_noemoji = df_noemoji.copy()

In [13]:
#This function
def tokenize(text):
    tokens = re.split('\W+', text)
    return tokens

df_token_noemoji['text'] = df_token_noemoji['text'].apply(lambda x: tokenize(x.lower()))

In [15]:
#Now that I have the most basic data structure that I want, I can start building my first model
#I'm simply going to use TFIDF and a random forest model for my baseline
#First I need to split the data back into training and testing sets
training_df=df_token_noemoji[df_token_noemoji['target'].notnull()]
test_df=df_token_noemoji[df_token_noemoji['target'].isnull()]
X_train = training_df.drop('target', axis=1)
y_train = training_df['target']
X_test = test_df.drop('target', axis=1)

In [19]:
import gensim
import numpy as np

In [18]:
w2v_model = gensim.models.Word2Vec(X_train,
                                   size=100,
                                   window=5,
                                   min_count=2)

In [21]:
words = set(w2v_model.wv.index2word)
X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                        for ls in X_train['text']])
X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                        for ls in X_test['text']])

In [22]:
X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100, dtype=float))

X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100, dtype=float))

In [25]:
rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect_avg, y_train.values.ravel())

In [30]:
scores = model_selection.cross_val_score(rf, X_train_vect_avg, y_train, cv=5, scoring="f1")
scores

array([0.00911854, 0.01510574, 0.00607903, 0.        , 0.01215805])