In [1]:
import pandas as pd
from cytoolz import identity
import spacy
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS

In [2]:
print(sorted(STOP_WORDS))

["'d", "'ll", "'m", "'re", "'s", "'ve", 'a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amount', 'an', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around', 'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides', 'between', 'beyond', 'both', 'bottom', 'but', 'by', 'ca', 'call', 'can', 'cannot', 'could', 'did', 'do', 'does', 'doing', 'done', 'down', 'due', 'during', 'each', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'even', 'ever', 'every', 'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty', 'first', 'five', 'for', 'former', 'formerly', 'forty', 'four', 'from', 'front', 'full', 'further', 'get', 'give', 'go', 'had', 'has', 'have', 'he', 'hence', 'her', 'here', 'he

In [3]:
df= pd.read_csv('classification.csv', index_col=0)
df.shape, df.head()

((15000, 6),
             date                                           headline  itemid  \
 1829  1997-01-22   Libya says plane flew to Ghana despite U.N. ban.  325883   
 1050  1997-07-06      Dirty laundry to be aired at Senate hearings.  709460   
 407   1996-09-08      BA denies report of 10,000 imminent job cuts.   39622   
 376   1996-12-30  Gulf states could reinstate Israel boycott - p...  281305   
 728   1997-03-06       Volvo Belgian plant to hold Friday stoppage.  425537   
 
                                                    text  sports  politics  
 1829  Libya said on Wednesday a Libyan plane had flo...   False     False  
 1050  A Senate investigation into political fund-rai...   False      True  
 407   British Airways (BA) denied on Monday a newspa...   False     False  
 376   Gulf Arab states could reinstate a boycott the...   False     False  
 728   Swedish vehicle maker AB Volvo said on Thursda...   False     False  )

In [4]:
test= pd.read_csv('classification_test.csv', index_col=0)

In [5]:
nlp = spacy.load('en_core_web_md', disable=['tagger', 'ner', 'parser'])

In [6]:
def tokenize(text):
    return [tok.text for tok in nlp.tokenizer(text.lower()) if (tok.text not in STOP_WORDS) and ((tok.text).isalpha())]
df['tokens'] = df['text'].apply(tokenize)
test['tokens'] = test['text'].apply(tokenize)

In [7]:
df.tokens.head()

1829    [libya, said, wednesday, libyan, plane, flown,...
1050    [senate, investigation, political, fund, raisi...
407     [british, airways, ba, denied, monday, newspap...
376     [gulf, arab, states, reinstate, boycott, lifte...
728     [swedish, vehicle, maker, ab, volvo, said, thu...
Name: tokens, dtype: object

In [8]:
sum(df['sports']), df.shape[0]

(2562, 15000)

The classifier below assigns the most frequent class as a prediction for every record.

In [9]:
from sklearn.feature_extraction.text import *

In [10]:
dtm = CountVectorizer(analyzer=identity)
dtm.fit(df['tokens'])
dtm.transform([df.iloc[0]['tokens']])

<1x75839 sparse matrix of type '<class 'numpy.int64'>'
	with 153 stored elements in Compressed Sparse Row format>

In [11]:
X = dtm.fit_transform(df['tokens'])
X_test = dtm.transform(test['tokens'])
X.shape

(15000, 75839)

 Before removal of non-letters and stopwords X dimensions were (15000, 93050).

In [12]:
from sklearn.naive_bayes import *

In [13]:
modelNB = BernoulliNB()

In [14]:
modelNB.fit(X, df['sports'])

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [15]:
modelNB.predict(X[0])

array([False])

In [16]:
df.iloc[0]

date                                               1997-01-22
headline     Libya says plane flew to Ghana despite U.N. ban.
itemid                                                 325883
text        Libya said on Wednesday a Libyan plane had flo...
sports                                                  False
politics                                                False
tokens      [libya, said, wednesday, libyan, plane, flown,...
Name: 1829, dtype: object

In [17]:
predictions = modelNB.predict(X_test)

I would like a function for error metrics computation, because I will compute different models and want to compare results.

In [18]:
def error_metrics(true_values, predictions):
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    accuracy = accuracy_score(true_values, predictions)
    precision = precision_score(true_values, predictions)
    recall = recall_score(true_values, predictions)
    f1 = f1_score(true_values, predictions)
    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1':f1}

In [19]:
err = error_metrics(true_values=test['sports'], predictions=predictions)

Only removal of stop words makes predictions worse.

Let us save the errors in a data frame.

In [20]:
errors = pd.DataFrame({'metrics': ['accuracy', 'precision', 'recall', 'f1']})
errors['BernoulliNB']= [err['accuracy'],  err['precision'], err['recall'], err['f1']]

## Feature Scaling
Scaling push numbers into middle range, somewhere between -4 and 4, most of them not close to 0. These numbers are computed with the most presicion by computers, because truncation errors for very small numbers may accumulate. 

In [21]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler(with_mean=False)
X = sc.fit_transform(X)
X_test = sc.transform(X_test)


## Fitting Support Vector Machines to the Training set

The code below takes about 7 minutes.

In [22]:
%%time

from sklearn.svm import SVC
modelSVC = SVC(kernel = 'rbf', random_state = 0, gamma='auto')
modelSVC.fit(X, df['sports'])

Wall time: 18min 11s


SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

In [23]:
predictions = modelSVC.predict(X_test)

In [24]:
cm = confusion_matrix(test['sports'], predictions)
print('confusion matrix:')
cm

NameError: name 'confusion_matrix' is not defined

In [None]:
err = error_metrics(true_values=test['sports'], predictions=predictions)
errors['SVC']=[err['accuracy'],  err['precision'], err['recall'], err['f1']]

## Logarithmic regression
A logarithmic regression fit a line (plane, hyperplane) between data points. 
It optimize the following equation:
$$ \log\left(\frac{odds}{1-odds}\right) = w_1 x_1+ w_2 x_2 + \cdots + w_n x_n
$$

Loss function: $$\text{Log Loss} = \sum_{(x,y)\in D} -y\log(y') - (1 - y)\log(1 - y')$$

Here we consider so calles Elastic Logarithmic regression, with penalties for I would like to use Penalty l2 means a restriction for coefficient growth. It helps with controlling outlier influence and to avoid overfitting.

Penalty l1 drops useless for prediction variables. Regretfully we do not have explicit information on what variables are dropped.

The code below takes about 15-16 minutes, and comes with a warning that the process did not converge. We still can use the model. 

In [None]:
%%time

from sklearn.linear_model import LogisticRegression
modelLG = LogisticRegression(penalty = 'elasticnet', solver ='saga', C=0.2,  l1_ratio =.9, max_iter=300, random_state = 0)
modelLG.fit(X, df['sports'])

In [None]:
predictions = modelLG.predict(X_test)

Computing of error metrics and a confusion matrix:

In [None]:
err = error_metrics(true_values=test['sports'], predictions=predictions)
errors['LogisticRegression']=[err['accuracy'],  err['precision'], err['recall'], err['f1']]
cm = confusion_matrix(test['sports'], predictions)
print('confusion matrix:')
cm

## Random Forest method

This was quick and with good accuracy.

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
%%time

modelRF = RandomForestClassifier(n_estimators=500)
modelRF.fit(X, df['sports'])

In [None]:
predictions = modelRF.predict(X_test)

In [None]:
err = error_metrics(true_values=test['sports'], predictions=predictions)
errors['RandomForest'] = [err['accuracy'],  err['precision'], err['recall'], err['f1']]
cm = confusion_matrix(test['sports'], predictions)
print('confusion matrix:')
cm

In [None]:
errors['RandomForest'] = [accuracy, precision, recall, f1]

In [None]:
errors