In [1]:
import pandas as pd
from cytoolz import identity
import spacy

In [2]:
df= pd.read_csv('classification.csv')
df.shape, df.head()

((15000, 7),
    Unnamed: 0        date                                           headline  \
 0        1829  1997-01-22   Libya says plane flew to Ghana despite U.N. ban.   
 1        1050  1997-07-06      Dirty laundry to be aired at Senate hearings.   
 2         407  1996-09-08      BA denies report of 10,000 imminent job cuts.   
 3         376  1996-12-30  Gulf states could reinstate Israel boycott - p...   
 4         728  1997-03-06       Volvo Belgian plant to hold Friday stoppage.   
 
    itemid                                               text  sports  politics  
 0  325883  Libya said on Wednesday a Libyan plane had flo...   False     False  
 1  709460  A Senate investigation into political fund-rai...   False      True  
 2   39622  British Airways (BA) denied on Monday a newspa...   False     False  
 3  281305  Gulf Arab states could reinstate a boycott the...   False     False  
 4  425537  Swedish vehicle maker AB Volvo said on Thursda...   False     False  )

In [3]:
test= pd.read_csv('classification_test.csv')

In [4]:
nlp = spacy.load('en_core_web_md', disable=['tagger', 'ner', 'parser'])

In [5]:
def tokenize(text):
    return [tok.orth_.lower() for tok in nlp.tokenizer(text)]
df['tokens'] = df['text'].apply(tokenize)
test['tokens'] = test['text'].apply(tokenize)

In [6]:
sum(df['sports']), df.head()

(2562,
    Unnamed: 0        date                                           headline  \
 0        1829  1997-01-22   Libya says plane flew to Ghana despite U.N. ban.   
 1        1050  1997-07-06      Dirty laundry to be aired at Senate hearings.   
 2         407  1996-09-08      BA denies report of 10,000 imminent job cuts.   
 3         376  1996-12-30  Gulf states could reinstate Israel boycott - p...   
 4         728  1997-03-06       Volvo Belgian plant to hold Friday stoppage.   
 
    itemid                                               text  sports  \
 0  325883  Libya said on Wednesday a Libyan plane had flo...   False   
 1  709460  A Senate investigation into political fund-rai...   False   
 2   39622  British Airways (BA) denied on Monday a newspa...   False   
 3  281305  Gulf Arab states could reinstate a boycott the...   False   
 4  425537  Swedish vehicle maker AB Volvo said on Thursda...   False   
 
    politics                                             tokens  

In [7]:
from sklearn.dummy import *
from sklearn.model_selection import *

In [8]:
model = DummyClassifier('most_frequent')
score = cross_val_score(model, df['tokens'], df['sports'], cv=5, n_jobs=-1)
score.mean()

0.8291999999999999

In [9]:
from sklearn.feature_extraction.text import *

In [10]:
dtm = CountVectorizer(analyzer=identity)
dtm.fit(df['tokens'])
dtm.transform([df.iloc[0]['tokens']])

<1x93292 sparse matrix of type '<class 'numpy.int64'>'
	with 222 stored elements in Compressed Sparse Row format>

In [11]:
X = dtm.fit_transform(df['tokens'])
X_test = dtm.transform(test['tokens'])
type(X)

scipy.sparse.csr.csr_matrix

In [12]:
from sklearn.naive_bayes import *

In [13]:
modelNB = BernoulliNB()

In [14]:
modelNB.fit(X, df['sports'])

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [15]:
modelNB.predict(X[0])

array([False])

In [16]:
df.iloc[0]

Unnamed: 0                                                 1829
date                                                 1997-01-22
headline       Libya says plane flew to Ghana despite U.N. ban.
itemid                                                   325883
text          Libya said on Wednesday a Libyan plane had flo...
sports                                                    False
politics                                                  False
tokens        [libya, said, on, wednesday, a, libyan, plane,...
Name: 0, dtype: object

In [17]:
predictions = modelNB.predict(X_test)

Making the Confusion Matrix

In [18]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(test['sports'], predictions)
cm

array([[4103,   20],
       [  58,  819]], dtype=int64)

## Feature Scaling
Scaling push numbers into middle range, somewhere between -4 and 4. These numbers are computed with the most presicion by computers.

Truncation errors may accumulate. 

In [19]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler(with_mean=False)
X = sc.fit_transform(X)
X_test = sc.transform(X_test)


## Fitting Support Vector Machines to the Training set

In [20]:
from sklearn.svm import SVC
modelSVC = SVC(kernel = 'rbf', random_state = 0, gamma='auto')
modelSVC.fit(X, df['sports'])


SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

In [21]:
y_pred = modelSVC.predict(X_test)

In [22]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(test['sports'], predictions)
cm

array([[4103,   20],
       [  58,  819]], dtype=int64)

## Logarithmic regression
A logarithmic regression fit a line (plane, hyperplane) between data points. 
It optimize the following equation:
$$ \log\left(\frac{odds}{1-odds}\right) = w_1 x_1+ w_2 x_2 + \cdots + w_n x_n
$$

Loss function: $$\text{Log Loss} = \sum_{(x,y)\in D} -y\log(y') - (1 - y)\log(1 - y')$$

In [23]:
X.shape

(15000, 93292)

Penalty l2 means a restriction for coefficient growth. 

In [24]:
from sklearn.linear_model import LogisticRegression
modelLG = LogisticRegression(penalty = 'elasticnet', solver ='saga', l1_ratio =.3,  random_state = 0)
modelLG.fit(X, df['sports'])



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=0.3, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='elasticnet',
                   random_state=0, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [25]:
y_pred = modelLG.predict(X_test)

In [26]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
f1 = f1_score(test['sports'], predictions)
f1

0.9545454545454546

In [27]:
from sklearn.neighbors import KNeighborsClassifier
modelKN = KNeighborsClassifier(n_neighbors=5)
modelKN.fit(X, df['politics'])

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [28]:
KN_pred = modelKN.predict(X_test)

In [29]:
KN_f1 = f1_score(test['politics'], predictions)
KN_f1

0.0008795074758135445