In [None]:
import pandas as pd
import numpy as np
from cytoolz import identity
import spacy
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import *

In [None]:
# Read the training frame (df) and the test frame (test) from the input folder.
train_csv = "../input/newsdata/classification.csv"
test_csv = "../input/newsdata/classification_test.csv"
df = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

## Importance of features, reducing number of features.

In [None]:
# The visible cells only ever call nlp.tokenizer, and the original load disabled the
# tagger, NER and parser anyway, so a blank English pipeline is sufficient.
# This avoids the 'en' shortcut link (removed in spaCy 3) and needs no downloaded model.
nlp = English()

In [None]:
def tokenize(text):
    """Return the lower-cased, purely alphabetic, non-stop-word tokens of ``text``.

    Parameters
    ----------
    text : str
        Raw document text; it is lower-cased before being tokenized.

    Returns
    -------
    list of str
        Token strings in document order, excluding anything found in
        ``STOP_WORDS`` and anything for which ``str.isalpha()`` is false.
    """
    kept = []
    for tok in nlp.tokenizer(text.lower()):
        word = tok.text
        if word.isalpha() and word not in STOP_WORDS:
            kept.append(word)
    return kept

# Add a 'tokens' column to both the training and the test frame.
for frame in (df, test):
    frame['tokens'] = frame['text'].apply(tokenize)

In [None]:
# The documents are already lists of tokens, so the analyzer simply passes them through.
# fit() returns the vectorizer itself, which lets construction and fitting be chained.
dtm = CountVectorizer(analyzer=identity).fit(df['tokens'])
dtm

In [None]:
# dtm was already fitted on df['tokens'] in the previous cell, so only transform here;
# fit_transform would rebuild the identical vocabulary a second time.
X = dtm.transform(df['tokens'])
X_test = dtm.transform(test['tokens'])
X.shape

I would like a function for error metrics computation, because I will compute different models and want to compare results.

## Feature Scaling
Scaling pushes the numbers into a middle range, somewhere between -4 and 4, with most of them not close to 0. Computers handle numbers in this range with the most precision, because truncation errors for very small numbers may accumulate.

In [None]:
from sklearn.preprocessing import StandardScaler

# Scale each feature without centring (with_mean=False); the scaler is fitted on the
# training matrix only and then applied unchanged to both matrices.
sc = StandardScaler(with_mean=False).fit(X)
X, X_test = sc.transform(X), sc.transform(X_test)

## Logistic regression
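Logistic regression fits a line (plane, hyperplane) between data points.
It fits the weights $w_i$ in the following equation, where $p$ is the predicted probability of the positive class and $\frac{p}{1-p}$ is the odds:
$$ \log\left(\frac{p}{1-p}\right) = w_1 x_1+ w_2 x_2 + \cdots + w_n x_n
$$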

Loss function: $$\text{Log Loss} = \sum_{(x,y)\in D} -y\log(y') - (1 - y)\log(1 - y')$$

Here we consider so-called elastic-net logistic regression, which combines the l1 and l2 penalties. Penalty l2 restricts coefficient growth. It helps to control the influence of outliers and to avoid overfitting.

Penalty l1 drops variables that are useless for prediction. Regrettably, the model does not report explicitly which variables were dropped.

The code below takes about 15-16 minutes, and comes with a warning that the process did not converge. We still can use the model. 

In [None]:
%%time

from sklearn.linear_model import LogisticRegression
modelLG = LogisticRegression(penalty = 'elasticnet', solver ='saga', C=0.2,  l1_ratio =.9, max_iter=200, random_state = 0)
modelLG.fit(X, df['sports'])

In [None]:
# Class labels predicted by the logistic model for the scaled test matrix.
predictions = modelLG.predict(X=X_test)

In [None]:
# Fitted feature weights of the logistic model.
# see https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
coefficients = modelLG.coef_
coefficients_type = type(coefficients)
print(coefficients_type)
coefficients.shape

In [None]:
# Boolean mask over the first row of coefficients: True where the weight is exactly zero.
dropped_features = coefficients[0, :] == 0
# Number of features with a zero weight.
dropped_features.sum()

In [None]:
# Vocabulary learned by the fitted vectorizer; the cells below use it to look up the
# column number of each word.
# see https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
vocab_order = dtm.vocabulary_
vocab_order.__class__

In [None]:
# Show the column number of every vocabulary word that begins with "abe".
for word, column in vocab_order.items():
    if word.startswith("abe"):
        print("For the word '" + word + "' its column number is ", column)


What words were deemed useless?

In [None]:
# Column numbers of the features flagged in the dropped_features mask.
n_features = coefficients.shape[1]
dropped_word_indices = np.arange(n_features)[dropped_features]
# Preview the first 33 of them.
dropped_word_indices[:33]

In [None]:
%%time

dropped_words=[]
for key in vocab_order:
    if int(vocab_order[key]) in dropped_word_indices:
        dropped_words.append(key)

print(dropped_words)  

## Random Forest method

This was quick and with good accuracy.

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
%%time

modelRF = RandomForestClassifier(n_estimators=5000, n_jobs =6)
modelRF.fit(X, df['sports'])

In [None]:
# Feature importance scores of the fitted forest.
# see https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
important_features = modelRF.feature_importances_
np.shape(important_features)

What is the most useful word for Random Forest method?

In [None]:
# Index of the feature with the highest importance score.
important_features.argmax()

In [None]:
# Compute the arg-max once; the original re-evaluated it for every vocabulary word.
most_important_column = int(np.argmax(important_features))
for key, column in vocab_order.items():
    if int(column) == most_important_column:
        print("The most useful word is '"+key+"'.")
        break  # each column belongs to exactly one word, so stop at the first match


In [None]:
# Compute the arg-min once; the original re-evaluated it for every vocabulary word.
# np.argmin returns only the first minimum, so if several features tie for the lowest
# importance this reports just one of them.
least_important_column = int(np.argmin(important_features))
for key, column in vocab_order.items():
    if int(column) == least_important_column:
        print("The least useful word is '"+key+"'.")
        break  # each column belongs to exactly one word, so stop at the first match


___
Here is an alert that sounds when your script has finished running, where 500 is the frequency in hertz and 2000 is the duration in milliseconds. Note that `winsound` is available on Windows only.

In [None]:
# winsound exists only on Windows; on any other platform the bare import raised
# ImportError and broke "Run All", so fall back to a plain text notice there.
try:
    import winsound
except ImportError:
    print("Notebook finished running.")
else:
    winsound.Beep(500, 2000)  # 500 Hz tone for 2000 ms