In [186]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import feature_extraction, linear_model, naive_bayes, model_selection, preprocessing


In [187]:
# Read the training and test tweets from their CSV files (train first, then test).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("datasets/train.csv", "datasets/test.csv")
)


In [188]:
# Example of a tweet labelled 0 (not a disaster): the second such row's text.
train_df.loc[train_df["target"] == 0, "text"].iloc[1]


'I love fruits'

In [189]:
# Example of a tweet labelled 1 (disaster): the second such row's text.
train_df.loc[train_df["target"] == 1, "text"].iloc[1]


'Forest fire near La Ronge Sask. Canada'

In [190]:
# Token-count vectorizer; a later cell re-fits it on the full training text.
count_vectorizer = feature_extraction.text.CountVectorizer()

# Fit on only the first 5 tweets to see what the count vectors look like.
example_train_vectors = count_vectorizer.fit_transform(train_df["text"].iloc[:5])

# The vectors are "sparse" (only non-zero elements are stored to save space),
# so densify the first tweet's row once, then show its shape and its counts.
first_tweet_dense = example_train_vectors[0].todense()
print(first_tweet_dense.shape)
print(first_tweet_dense)


(1, 54)
[[0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0
  0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0]]


In [191]:
# Learn the token vocabulary from the whole training text and encode it in one step.
train_vectors = count_vectorizer.fit_transform(raw_documents=train_df["text"])

# The test text is encoded with .transform() only -- deliberately NOT
# .fit_transform() -- so that only the tokens learned from the training text
# are mapped, i.e. the train and test vectors share the same set of tokens.
test_vectors = count_vectorizer.transform(raw_documents=test_df["text"])

In [192]:
## Our vectors are really big, so we want to push the model's weights toward 0
## without completely discounting different words. The model used here is a
## linear classifier trained with stochastic gradient descent: hinge loss plus
## an L2 penalty whose strength is set by `alpha`.
## tol=None turns off the tolerance-based stopping check, so training is
## expected to run for the full `max_iter` epochs; random_state fixes the seed.
##
## Alternatives kept for reference (not used):
##   linear_model.RidgeClassifierCV()
##   naive_bayes.MultinomialNB(alpha=2)
clf = linear_model.SGDClassifier(
    loss="hinge",
    penalty="l2",
    alpha=1e-3,
    max_iter=1000,
    tol=None,
    random_state=42,
)


In [193]:
# 5-fold cross-validated F1 of the classifier on the training vectors;
# the trailing expression displays one score per fold.
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_vectors,
    y=train_df["target"],
    scoring="f1",
    cv=5,
)
scores


array([0.61845861, 0.52741514, 0.61806131, 0.56586271, 0.70248597])

In [194]:
# Train the classifier on all training vectors; the fitted estimator is the cell output.
clf.fit(X=train_vectors, y=train_df["target"])


SGDClassifier(alpha=0.001, random_state=42, tol=None)

In [195]:
# Start from the sample submission file, set its "target" column to the
# classifier's predictions for the test vectors, and write the result to CSV
# without the DataFrame index.
sample_submission = pd.read_csv("datasets/sample_submission.csv").assign(
    target=clf.predict(test_vectors)
)
sample_submission.to_csv("datasets/submission.csv", index=False)
