In [105]:
import h2o
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
import re

import nltk
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer

In [3]:
#Words are the integral part of any classification technique. 
#However, these words are often used with different variations in the text depending on their grammar (verb, adjective, noun, etc.).
#It is always a good practice to normalize the terms to their root forms.
#This technique is known as Lemmatization.

class LemmaTokenizer:
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

In [4]:
train = pd.read_csv('./data/train.csv')
prediction_frame = pd.read_csv('./data/test.csv')

In [5]:
train.shape

(7613, 5)

In [6]:
train.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [7]:
#
# cardinality
#
train.apply(pd.Series.nunique)

id          7613
keyword      221
location    3341
text        7503
target         2
dtype: int64

In [117]:
#
# show examples for positives and negatives in training data
#
print("pos:", train[train["target"] == 1]["text"].values[30])
print("neg:", train[train["target"] == 0]["text"].values[0])

pos: Reported motor vehicle accident in Curry on Herman Rd near Stephenson involving an overturned vehicle. Please use... 
neg: What's up man?


In [118]:
#
# clean text from urls
#
train['text'] = train['text'].apply(lambda x: re.split('http[s]?:\/\/.*', str(x))[0])

In [159]:
# try different vectorization methods to improve model performance (TFIDF, LSA, LSTM / RNNs)

#tfidf vectorizer (tatistical measure that evaluates how relevant a word is to a document in a collection of documents.)

tfidf_vectorizer = feature_extraction.text.TfidfVectorizer(tokenizer=LemmaTokenizer(), ngram_range=(1,1))


## get counts for the first 5 tweets in the data
example_train_vectors = tfidf_vectorizer.fit_transform(train["text"][0:5])

In [160]:
print(example_train_vectors.todense().shape)
print(example_train_vectors[1].todense())

(5, 57)
[[0.         0.         0.         0.29167942 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.36152912 0.         0.         0.
  0.         0.36152912 0.36152912 0.         0.         0.
  0.         0.         0.         0.36152912 0.         0.36152912
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.36152912 0.         0.36152912
  0.         0.         0.         0.         0.         0.
  0.         0.         0.        ]]


In [161]:
train_vectors = tfidf_vectorizer.fit_transform(train["text"])
prediction_frame_vectors = tfidf_vectorizer.transform(prediction_frame["text"])

In [162]:
clf = linear_model.RidgeClassifier(alpha=0.8, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None,
                solver='auto', tol=0.001)

In [163]:
scores = model_selection.cross_val_score(clf, train_vectors, train["target"], cv=3, scoring="f1")
scores

array([0.6461039 , 0.60572012, 0.69129801])

In [72]:
clf.fit(train_vectors, train["target"])
sample_submission = pd.read_csv("./data/sample_submission.csv")
sample_submission["target"] = clf.predict(prediction_frame_vectors)
sample_submission.to_csv("./data/output_submission.csv", index=False)

In [None]:
# best output yet
#array([0.65610143, 0.60869565, 0.69466403])