In [None]:
#Libraries used for data importing and cleaning
import re
import string

import pandas as pd

#Libraries used for TFIDF (CountVectorizer is required by the cv() helper)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

#Random forest library
from sklearn.ensemble import RandomForestClassifier

#To score the results
from sklearn import model_selection

In [None]:
#Load the training tweets and the test tweets from the local data folder
tweet = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

In [None]:
#Concatenate the two datasets so that exactly the same cleaning is applied to both.
#ignore_index=True gives every row a unique label; without it the rows coming from
#`tweet` and `test` keep their own (overlapping) labels and label-based lookups
#on the combined frame become ambiguous.
df = pd.concat([tweet, test], ignore_index=True)

In [None]:
#There are 3 basic, near-universal cleaning steps that apply to this data:
#removing URLs, HTML tags, and punctuation
def remove_URL(text):
    """Return `text` with every http(s):// or www. link deleted.

    A link is taken to run from its prefix up to the next whitespace
    character; the surrounding whitespace itself is left in place.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

def remove_html(text):
    """Return `text` with every `<...>` tag deleted.

    The match is non-greedy, so each tag is removed on its own and the
    text between an opening and a closing tag is kept.
    """
    return re.sub(r'<.*?>', '', text)

def remove_punct(text):
    """Return `text` with every character of `string.punctuation` deleted."""
    # Mapping a code point to None tells str.translate to drop that character.
    drop_punctuation = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_punctuation)

In [None]:
#Now execute the functions to clean the data: URLs first, then HTML tags, then punctuation
df['text'] = (
    df['text']
    .apply(remove_URL)
    .apply(remove_html)
    .apply(remove_punct)
)

In [None]:
#Another common task that must be dealt with is emojis.
#Some people simply remove them and do not deal with them;
#others use them as additional features.
#For now I will take the path of least resistance and simply remove them,
#working on a separate copy so the cleaned frame above stays untouched.
df_noemoji = df.copy(deep=True)

In [None]:
# Compiled once and reused by remove_emoji. Each range is an explicit symbol or
# emoji block; a single catch-all range such as U+24C2..U+1F251 must NOT be
# used because it also spans CJK, Hangul, kana and other ordinary scripts.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001F000-\U0001F2FF"  # mahjong/dominoes/cards, enclosed alphanumerics (incl. flags), enclosed ideographs
    "\U00002600-\U000027BF"  # miscellaneous symbols & dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U000024C2"             # circled latin capital letter M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return `text` with emoji and pictographic symbols deleted.

    Only characters inside the explicit blocks of `_EMOJI_PATTERN` are
    removed; letters of any script (Latin, CJK, Hangul, ...) are preserved.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        `text` without the matched characters. Surrounding whitespace is
        left as it was.
    """
    return _EMOJI_PATTERN.sub('', text)

In [None]:
#Strip the emojis from every tweet; the function is passed directly, no lambda wrapper needed
df_noemoji['text'] = df_noemoji['text'].apply(remove_emoji)

In [None]:
#The next step is to tokenize the text. The tokens go into a new DataFrame
#so that the emoji-free frame keeps its plain-string text column.
df_token_noemoji = df_noemoji.copy(deep=True)

In [None]:
#Preview the first five rows before tokenizing
df_token_noemoji.head(5)

In [None]:
def tokenize(text):
    """Split `text` into a list of word tokens.

    A token is a maximal run of word characters (`\\w`), i.e. the text is cut
    on every run of non-word characters. Unlike a plain `re.split`, no empty
    strings are produced for leading/trailing separators or for empty input,
    so no spurious '' token reaches the vectorizer.

    Parameters
    ----------
    text : str
        The string to tokenize.

    Returns
    -------
    list of str
        The tokens in order of appearance; `[]` when `text` has no word
        characters.
    """
    # Raw string: '\w' in a normal string literal is an invalid escape sequence.
    return re.findall(r'\w+', text)

#Lower-case every tweet, then replace it with its list of tokens
df_token_noemoji['text'] = df_token_noemoji['text'].str.lower().apply(tokenize)

In [None]:
#The next decision is to drop the id, keyword and location columns, as they are not used by this model
df_token_noemoji_drop = df_token_noemoji.drop(columns=['id', 'keyword', 'location'])

In [None]:
#With the basic data structure in place, the first model can be built.
#The baseline is simply TF-IDF features fed into a random forest.
#First the combined frame is split back into training and testing sets:
#rows with a target value are training rows, rows without one are test rows.
has_target = df_token_noemoji_drop['target'].notna()
training_df = df_token_noemoji_drop[has_target]
test_df = df_token_noemoji_drop[~has_target]

X_train = training_df.drop(columns='target')
y_train = training_df['target']
X_test = test_df.drop(columns='target')

In [None]:
#Preview the first five test rows
X_test.head(5)

In [None]:
#Now to do some TF-IDF
#The text column already holds lists of tokens, and I was getting a
#"list has no attribute .lower" error from the vectorizer. I found this workaround at
#http://www.davidsbatista.net/blog/2018/02/28/TfidfVectorizer/
def dummy_fun(doc):
    """Return `doc` unchanged.

    Identity function handed to TfidfVectorizer as both `tokenizer` and
    `preprocessor`, so that documents which are already token lists are
    passed through as they are.
    """
    return doc

#The documents are already token lists, so the tokenizer and the preprocessor
#are both the identity function and the regex token pattern is switched off.
tfidf_vect = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None,
).fit(X_train['text'])  #fit() returns the fitted vectorizer itself

#The vocabulary and IDF weights come from the training tweets only;
#both splits are then projected with that same fitted vectorizer.
X_train_vect = tfidf_vect.transform(X_train['text'])
X_test_vect = tfidf_vect.transform(X_test['text'])

In [None]:
#A fixed random_state makes the forest (and every score derived from it)
#reproducible across kernel restarts; without it each run gives different numbers.
rf = RandomForestClassifier(random_state=42)
#fit() returns the estimator itself, so rf_model and rf are the same fitted object.
#y_train is already one-dimensional, so it can be passed as it is.
rf_model = rf.fit(X_train_vect, y_train)

In [None]:
def cv(data):
    """Fit a bag-of-words CountVectorizer on `data` and return the counts.

    Requires `CountVectorizer` from `sklearn.feature_extraction.text` to be
    imported in the notebook's import cell.

    Parameters
    ----------
    data : iterable
        The documents to vectorize. The vectorizer is created with its default
        settings, so presumably raw strings are expected rather than token
        lists -- verify against the caller.

    Returns
    -------
    tuple
        `(emb, count_vectorizer)`: the document-term count matrix returned by
        `fit_transform`, and the fitted vectorizer so that further data can
        be transformed with the same vocabulary.
    """
    count_vectorizer = CountVectorizer()
    emb = count_vectorizer.fit_transform(data)
    return emb, count_vectorizer

In [None]:
#5-fold cross-validated F1 score of the random forest on the training features
scores = model_selection.cross_val_score(
    estimator=rf,
    X=X_train_vect,
    y=y_train,
    scoring="f1",
    cv=5,
)
scores

In [None]:
#Once again, this is what I would consider an absolute baseline model.
#It could be a little bit lighter, but the point is that this is a very basic model and it's only up from here
#And there is a long way up to go. This model's scores com