In [None]:
#Libraries used for data importing and cleaning
import re
import string

import numpy as np
import pandas as pd

#Libraries used for TFIDF and bag-of-words counts
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

#Random forest library
from sklearn.ensemble import RandomForestClassifier

#To score the results
from sklearn import model_selection

In [None]:
#Import the data: read the training and test CSV files into DataFrames
tweet = pd.read_csv(filepath_or_buffer='data/train.csv')
test = pd.read_csv(filepath_or_buffer='data/test.csv')

In [None]:
#Next I am going to concat the two datasets so that the data cleaning I do on them is equal.
#ignore_index=True gives the combined frame a fresh, unique 0..n-1 index; without it the
#row labels of both files are kept as-is, so the same label can appear twice and any
#later label-based alignment or lookup becomes ambiguous.
df = pd.concat([tweet, test], ignore_index=True)

In [None]:
#Drop the id, keyword, and location columns; only the remaining columns are used from here on
unused_columns = ['id', 'keyword', 'location']
df = df.drop(columns=unused_columns)

In [None]:
#Empty frame that will collect one column per hand-crafted text statistic
df_feature = pd.DataFrame()

In [None]:
## Coerce every entry to a string and split it on whitespace once; ##
## the word-level statistics below all reuse this tokenisation      ##
raw_text = df["text"].apply(lambda entry: str(entry))
words = raw_text.apply(lambda entry: entry.split())

## Number of words in the text ##
df_feature["num_words"] = words.apply(len)

## Number of unique words in the text ##
df_feature["num_unique_words"] = words.apply(lambda ws: len(set(ws)))

## Number of characters in the text ##
df_feature["num_chars"] = raw_text.apply(len)

## Number of punctuation characters in the text ##
df_feature["num_punctuations"] = raw_text.apply(
    lambda entry: sum(ch in string.punctuation for ch in entry)
)

## Number of fully upper-case words in the text ##
df_feature["num_words_upper"] = words.apply(lambda ws: sum(w.isupper() for w in ws))

## Number of title-case words in the text ##
df_feature["num_words_title"] = words.apply(lambda ws: sum(w.istitle() for w in ws))

## Mean length of the words in the text (NaN when the text has no words) ##
df_feature["mean_word_len"] = words.apply(lambda ws: np.mean([len(w) for w in ws]))

In [None]:
#Preview the first rows of the engineered text statistics
df_feature.head()

In [None]:
#Three basic, widely applicable cleaning steps apply to this data:
#removing URLs, HTML tags, and punctuation
def remove_URL(text):
    """Return `text` with every http(s):// link and www. address deleted.

    A match runs from the scheme (or ``www.``) up to the next whitespace
    character; the surrounding whitespace itself is left in place.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

def remove_html(text):
    """Return `text` with every ``<...>`` span deleted.

    The match is non-greedy, so each tag is removed on its own and the text
    between an opening and a closing tag is kept.
    """
    tag_pattern = r'<.*?>'
    return re.sub(tag_pattern, '', text)

def remove_punct(text):
    """Return `text` with every character in ``string.punctuation`` deleted."""
    # Mapping a code point to None tells str.translate to drop that character.
    drop_punctuation = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_punctuation)

#I am also going to add in the step of removing the emojis and emoticons
#They are too few to be helpful but present enough to be annoying
def remove_emoji(text):
    """Return `text` with emoji and pictographic symbols deleted.

    Only symbol blocks are listed in the character class. A single wide range
    such as U+24C2..U+1F251 would also span CJK, Hangul, kana and other
    ordinary scripts and silently delete real text, so the ranges are kept
    narrow on purpose.
    """
    emoji_pattern = re.compile("["
                           "\U0001F600-\U0001F64F"  # emoticons
                           "\U0001F300-\U0001F5FF"  # symbols & pictographs
                           "\U0001F680-\U0001F6FF"  # transport & map symbols
                           "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics/ideographs, flags
                           "\U00002600-\U000026FF"  # miscellaneous symbols
                           "\U00002702-\U000027B0"  # dingbats
                           "\U000024C2"             # circled latin capital M
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [None]:
#Now to execute the functions to clean the data.
#Order matters: URLs and HTML tags are stripped before punctuation, because the
#URL and tag patterns rely on characters (':', '/', '.', '<', '>') that
#remove_punct deletes.
for cleaner in (remove_URL, remove_html, remove_punct, remove_emoji):
    df['text'] = df['text'].apply(cleaner)

In [None]:
#Now to tokenize
def tokenize(text):
    """Split `text` into a list of word tokens.

    A token is a maximal run of word characters (``\\w+``). Unlike splitting
    on ``\\W+``, this never yields empty-string tokens when the text starts or
    ends with a non-word character, and an empty text gives an empty list.

    Parameters
    ----------
    text : str
        The text to tokenize.

    Returns
    -------
    list of str
        The tokens in their original order.
    """
    return re.findall(r'\w+', text)

In [None]:
#Copy of df whose text column holds the lower-cased token list of each entry;
#df itself keeps the cleaned strings
df_token = df.assign(
    text=df['text'].apply(lambda entry: tokenize(entry.lower()))
)

In [None]:
#Preview the first rows of the tokenized frame
df_token.head()

In [None]:
#Put the engineered text statistics side by side with the tokenized frame.
#Both frames were derived from df, so their rows line up on the shared index.
frames_to_join = [df_feature, df_token]
df_model = pd.concat(frames_to_join, axis=1)

In [None]:
#Preview the first rows of the combined modelling frame
df_model.head()

In [None]:
#Split the combined frame back apart using the target column: rows with a
#target value form the training set, rows with a missing target form the test
#set (presumably the rows that came from test.csv -- confirm it has no target column)
has_target = df_model['target'].notna()
training_df = df_model[has_target]
test_df = df_model[~has_target]

y_train = training_df['target']
X_train = training_df.drop(columns='target')
X_test = test_df.drop(columns='target')

In [None]:
#Preview the first rows of the test features
X_test.head()

In [None]:
#Now to do some TF-IDF
#The text column already holds token lists, and the vectorizer's default string
#handling fails on them with a "list has no attribute .lower" error. The
#pass-through workaround below comes from
#http://www.davidsbatista.net/blog/2018/02/28/TfidfVectorizer/
def dummy_fun(doc):
    """Return `doc` unchanged so pre-tokenized documents pass straight through."""
    return doc

tfidf_vect = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None,
)
#Fit on the training text only, then reuse the fitted vectorizer on the test text
X_train_vect = tfidf_vect.fit_transform(X_train['text'])
X_test_vect = tfidf_vect.transform(X_test['text'])

In [None]:
#Fix the seed so the fitted forest, and any later scoring that reuses `rf`,
#gives the same numbers after a kernel restart.
RANDOM_STATE = 42
rf = RandomForestClassifier(random_state=RANDOM_STATE)
#ravel() hands the labels to fit() as a flat 1-D array
rf_model = rf.fit(X_train_vect, y_train.values.ravel())

In [None]:
def cv(data):
    """Fit a default CountVectorizer on `data` and return the count matrix.

    Parameters
    ----------
    data : iterable
        The documents to vectorize. They are handed to a CountVectorizer with
        default settings -- NOTE(review): that presumably requires raw strings
        rather than the token lists built earlier in this notebook; confirm
        before calling it on the tokenized text column.

    Returns
    -------
    tuple
        ``(emb, count_vectorizer)``: the document-term matrix returned by
        ``fit_transform`` and the fitted vectorizer, so the same vocabulary
        can be reused on other data.
    """
    count_vectorizer = CountVectorizer()
    emb = count_vectorizer.fit_transform(data)
    return emb, count_vectorizer

In [None]:
#Score the random forest with 5-fold cross-validation, using F1 as the metric.
#NOTE: X_train_vect was produced by a TF-IDF vectorizer fitted on all of
#X_train, so every validation fold already influenced the features it is
#scored on.
scores = model_selection.cross_val_score(
    estimator=rf,
    X=X_train_vect,
    y=y_train,
    scoring="f1",
    cv=5,
)
scores