In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

## Prepare datasets

In [2]:
# Load data
data = pd.read_csv('../data/train.csv', index_col=0)

In [3]:
data.head()

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# Check the target variable distribution
print(data['target'].value_counts())

0    4342
1    3271
Name: target, dtype: int64


In [5]:
# Split into X and y
X, y = data['text'], data['target']

In [6]:
# Convert X to TFIDF feature vector
feature_extraction = TfidfVectorizer()
X_features = feature_extraction.fit_transform(X.values)

In [7]:
X_features

<7613x21637 sparse matrix of type '<class 'numpy.float64'>'
	with 111497 stored elements in Compressed Sparse Row format>

In [8]:
# Reduce the size of the vocabulary by setting `min_df` to 0.01
feature_extraction = TfidfVectorizer(min_df=0.01) 
X_features_reduced = feature_extraction.fit_transform(X.values)

In [9]:
X_features_reduced

<7613x150 sparse matrix of type '<class 'numpy.float64'>'
	with 44502 stored elements in Compressed Sparse Row format>

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_features_reduced, y, test_size=0.2, random_state=3)

## SVM classifier

In [11]:
# Define a 3-fold splits in grid search CV, random_state is fixed
fold = KFold(n_splits=3, shuffle=True, random_state=3)

In [12]:
# A grid of parameters in SVM
parameters = {'kernel':('linear', 'rbf'), 'C':[0.1, 1, 10, 100]}

clf = SVC(verbose=True)
cv_clf = GridSearchCV(clf, parameters, scoring='accuracy', cv=fold)

In [13]:
# Fit the CV model 
cv_clf.fit(X_train, y_train)

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

GridSearchCV(cv=KFold(n_splits=3, random_state=3, shuffle=True),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=True),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.1, 1, 10, 100], 'kernel': ('linear', 'rbf')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [14]:
cv_clf.best_estimator_

SVC(C=1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=True)

In [15]:
# validation f1 score and report, using the best model from cv_clf
y_pred = cv_clf.predict(X_test)

print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.7399868680236376
              precision    recall  f1-score   support

           0       0.71      0.90      0.79       841
           1       0.81      0.55      0.65       682

    accuracy                           0.74      1523
   macro avg       0.76      0.72      0.72      1523
weighted avg       0.75      0.74      0.73      1523

