In [1]:
import h2o
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

In [2]:
train = pd.read_csv('./data/train.csv')
prediction_frame = pd.read_csv('./data/test.csv')

In [3]:
train.shape

(7613, 5)

In [4]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
train.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [6]:
#
# cardinality
#
train.apply(pd.Series.nunique)

id          7613
keyword      221
location    3341
text        7503
target         2
dtype: int64

In [7]:
train[train["target"] == 1]["text"].values[0]

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

In [8]:
count_vectorizer = feature_extraction.text.CountVectorizer()

## get counts for the first 5 tweets in the data
example_train_vectors = count_vectorizer.fit_transform(train["text"][0:5])

In [9]:
## we use .todense() here because these vectors are "sparse" (only non-zero elements are kept to save space)

# There are 54 unique words (or "tokens") in the first five tweets.
# The first tweet contains only some of those unique tokens - all of the non-zero counts above are the tokens that DO exist in the first tweet.

print(example_train_vectors.todense().shape)
print(example_train_vectors[0].todense())

(5, 54)
[[0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0
  0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0]]


In [10]:
train_vectors = count_vectorizer.fit_transform(train["text"])

## note that we're NOT using .fit_transform() here. Using just .transform() makes sure
# that the tokens in the train vectors are the only ones mapped to the test vectors - 
# i.e. that the train and test vectors use the same set of tokens.
prediction_frame_vectors = count_vectorizer.transform(prediction_frame["text"])

In [11]:
## Our vectors are really big, so we want to push our model's weights
## toward 0 without completely discounting different words - ridge regression 
## is a good way to do this.
clf = linear_model.RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=True, random_state=None,
                solver='auto', tol=0.001)

In [12]:
#Let's test our model and see how well it does on the training data. 
#For this we'll use cross-validation - where we train on a portion of the known data, then validate it with the rest. 
#If we do this several times (with different portions) we can get a good idea for how a particular model or method performs.

#The metric for this competition is F1, so let's use that here.


scores = model_selection.cross_val_score(clf, train_vectors, train["target"], cv=3, scoring="f1")
scores

array([0.6174569 , 0.58363731, 0.70333988])

In [13]:
clf.fit(train_vectors, train["target"])

RidgeClassifier(normalize=True)

In [14]:
sample_submission = pd.read_csv("./data/sample_submission.csv")

In [15]:
sample_submission["target"] = clf.predict(prediction_frame_vectors)

NameError: name 'test_vectors' is not defined

In [53]:
sample_submission.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,1
3,9,1
4,11,1


In [55]:
sample_submission.to_csv("./data/output_submission.csv", index=False)