based on How to not overfit? Kaggle solution https://www.kaggle.com/artgor/how-to-not-overfit

In [2]:
import os
import json
import ast
import time
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn

In [3]:
from scipy import stats
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold,  RepeatedStratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import model_selection
from sklearn.metrics import  f1_score
from sklearn import linear_model
from sklearn.neighbors import NearestNeighbors
from sklearn.datasets import make_classification, fetch_20newsgroups

In [4]:
!pip install eli5

Collecting eli5
[?25l  Downloading https://files.pythonhosted.org/packages/97/2f/c85c7d8f8548e460829971785347e14e45fa5c6617da374711dec8cb38cc/eli5-0.10.1-py2.py3-none-any.whl (105kB)
[K     |███                             | 10kB 16.5MB/s eta 0:00:01[K     |██████▏                         | 20kB 1.9MB/s eta 0:00:01[K     |█████████▎                      | 30kB 2.2MB/s eta 0:00:01[K     |████████████▍                   | 40kB 2.5MB/s eta 0:00:01[K     |███████████████▌                | 51kB 2.5MB/s eta 0:00:01[K     |██████████████████▋             | 61kB 2.8MB/s eta 0:00:01[K     |█████████████████████▊          | 71kB 3.0MB/s eta 0:00:01[K     |████████████████████████▊       | 81kB 3.2MB/s eta 0:00:01[K     |███████████████████████████▉    | 92kB 3.1MB/s eta 0:00:01[K     |███████████████████████████████ | 102kB 3.3MB/s eta 0:00:01[K     |████████████████████████████████| 112kB 3.3MB/s 
Installing collected packages: eli5
Successfully installed eli5-0.10.1


In [5]:
import eli5
from eli5.sklearn import PermutationImportance



In [6]:
import warnings
warnings.filterwarnings('ignore')


### выкачиваем датасет 20newsgroups

In [7]:

categories = ['soc.religion.christian', 'sci.med', 'rec.sport.baseball', 'talk.politics.misc']
class_number = len(categories)

In [8]:
train = fetch_20newsgroups(subset='train',  shuffle=True, categories=categories,  random_state=0, remove='headers')
test = fetch_20newsgroups(subset='test',  shuffle=True, categories=categories, random_state=0, remove='headers')

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [None]:
#train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [15]:
count_vect = CountVectorizer(max_df=100, min_df=5)

In [16]:
print(len(train.data))
print(len(test.data))

2255
1501


In [17]:
y_train = train.target
y_test = test.target

In [18]:
X_train = count_vect.fit_transform(train.data)
X_test = count_vect.transform(test.data)

In [19]:
print(X_train.shape)
print(X_test.shape)

(2255, 8197)
(1501, 8197)


In [20]:
vocab = count_vect.vocabulary_
index_to_word = {v:k for k,v in vocab.items()}
index_to_class = {i:n for i,n in enumerate(train.target_names)}

## как оценить примерное качество модели, если тест закрыт?

In [None]:
# обучимся на том, что есть, и оценим качество предсказаний на трейне
model = linear_model.LogisticRegression()
model = model.fit(X_train, y_train)


In [None]:
train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

In [None]:
print('result on train: {}'.format(f1_score(y_train, train_preds, average='micro')))
print('result on test: {}'.format(f1_score(y_test,test_preds, average='micro')))

result on train: 0.9991130820399113
result on test: 0.8734177215189873


### Overfitting!

![](https://i.stack.imgur.com/1QU0m.png)

## Cross Validation (+ stratification)

![](https://scikit-learn.org/stable/_images/grid_search_cross_validation.png)

### KFold, StratifiedKFold, RepeatedStratifiedKFold

In [10]:
n_fold = 5
folds = KFold(n_splits=n_fold, shuffle=True, random_state=0)
stratified_folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=0)
repeated_folds = RepeatedStratifiedKFold(n_splits=n_fold, n_repeats=20, random_state=0)

In [None]:
for train_index, valid_index in folds.split(X_train, y_train): 
    print("TRAIN:", len(train_index), "TEST:", len(valid_index))
    to_validate = y_train[valid_index]
    cls, counts = np.unique(to_validate, return_counts=True)
    print('value_counts {}\n'.format([(cl, value) for cl, value in zip(cls, counts)]))

TRAIN: 1804 TEST: 451
value_counts [(0, 115), (1, 132), (2, 113), (3, 91)]

TRAIN: 1804 TEST: 451
value_counts [(0, 114), (1, 120), (2, 123), (3, 94)]

TRAIN: 1804 TEST: 451
value_counts [(0, 133), (1, 104), (2, 114), (3, 100)]

TRAIN: 1804 TEST: 451
value_counts [(0, 118), (1, 120), (2, 129), (3, 84)]

TRAIN: 1804 TEST: 451
value_counts [(0, 117), (1, 118), (2, 120), (3, 96)]



In [None]:
for train_index, valid_index in stratified_folds.split(X_train, y_train):
    print("TRAIN:", len(train_index), "TEST:", len(valid_index))
    to_validate = y_train[valid_index]
    cls, counts = np.unique(to_validate, return_counts=True)
    print('value_counts {}\n'.format([(cl, value) for cl, value in zip(cls, counts)]))

TRAIN: 1804 TEST: 451
value_counts [(0, 119), (1, 119), (2, 120), (3, 93)]

TRAIN: 1804 TEST: 451
value_counts [(0, 119), (1, 119), (2, 120), (3, 93)]

TRAIN: 1804 TEST: 451
value_counts [(0, 119), (1, 119), (2, 120), (3, 93)]

TRAIN: 1804 TEST: 451
value_counts [(0, 120), (1, 118), (2, 120), (3, 93)]

TRAIN: 1804 TEST: 451
value_counts [(0, 120), (1, 119), (2, 119), (3, 93)]



In [23]:
def cv_train_model(X, y, X_test, y_test, folds=folds, model=None):
    prediction = np.zeros((len(y_test), class_number))
    valid_scores = []
    #feature_importance = pd.DataFrame()
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]
        
        model.fit(X_train, y_train)      
        y_pred_test_proba = model.predict_proba(X_test)
        y_pred_valid = model.predict(X_valid)
        #ipdb.set_trace()
        valid_score = f1_score(y_valid, y_pred_valid, average='micro')
        
        valid_scores.append(valid_score)
        prediction += y_pred_test_proba 
        
    prediction /= (fold_n + 1)
    prediction = np.argmax(prediction, axis=1)
    test_score_proba_sum = f1_score(y_test, prediction, average='micro')
    print('Cross Validation mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(valid_scores), np.std(valid_scores)))
    print("score on test data: {0:.4f}".format(test_score_proba_sum)) 

    return prediction, valid_scores

In [None]:
model = linear_model.LogisticRegression()
prediction_lr, scores = cv_train_model(X_train, y_train, X_test, y_test,  model=model,  folds=folds)

Cross Validation mean score: 0.9441, std: 0.0055.
score on test data: 0.8721


In [None]:
scores

[0.9401330376940134,
 0.9423503325942351,
 0.9379157427937915,
 0.9467849223946785,
 0.9534368070953437]

In [None]:
# попробуем другие разбиения на трейн и валидацию
model = linear_model.LogisticRegression()
prediction_lr_stratified, scores = cv_train_model(X_train, y_train, X_test, y_test, model=model,  folds=stratified_folds)

Cross Validation mean score: 0.9455, std: 0.0108.
score on test data: 0.8734



## ELI5

ELI5 позволяет смотреть на вклад разных признаков 

In [None]:
eli5.show_weights(model, top=50)

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3
+0.705,x3019,,
+0.705,x5540,,
+0.682,x7359,,
+0.673,x2096,,
+0.660,x4177,,
+0.633,x4149,,
+0.604,x6966,,
+0.601,x7041,,
+0.596,x3535,,
+0.593,x5589,,

Weight?,Feature
+0.705,x3019
+0.705,x5540
+0.682,x7359
+0.673,x2096
+0.660,x4177
+0.633,x4149
+0.604,x6966
+0.601,x7041
+0.596,x3535
+0.593,x5589

Weight?,Feature
+0.806,x2680
+0.667,x2502
+0.602,x4969
+0.512,x2370
+0.483,x2677
+0.481,x4749
+0.477,x7601
+0.476,x4744
+0.469,x1223
+0.468,x5250

Weight?,Feature
+0.890,x1643
+0.753,x3618
+0.698,x6560
+0.586,x3148
+0.585,x4553
+0.584,x4191
+0.535,x4140
+0.488,x6837
+0.487,x4178
+0.476,x6350

Weight?,Feature
+1.226,x1648
+0.754,x4252
+0.729,x7611
+0.657,x3747
+0.603,x7342
+0.579,x575
+0.559,x4229
+0.556,x5431
+0.546,x5760
+0.543,x5276


Выберем признаки, вносящие, максимальный вклад, и оставим только их

In [None]:
eli5_weights = eli5.formatters.as_dataframe.explain_weights_df(model)

In [None]:
eli5_weights[eli5_weights.target==1]

Unnamed: 0,target,feature,weight
8198,1,x2680,0.806267
8199,1,x2502,0.666835
8200,1,x4969,0.601545
8201,1,x2370,0.511930
8202,1,x2677,0.482643
...,...,...,...
16391,1,x1648,-0.330190
16392,1,x8103,-0.352920
16393,1,x1643,-0.363525
16394,1,x575,-0.377124


In [None]:
top_features = []
for i in range(class_number):
  class_features = [int(i[1:]) for i in eli5_weights[eli5_weights.target==i].feature if 'BIAS' not in i]
  len(class_features)
  top_features += class_features[:1000]


In [None]:
len(set(top_features))

3742

In [None]:
top_features = list(set(top_features))

In [None]:
X_train_eli5 = X_train[:,top_features]
X_test_eli5 = X_test[:,top_features]

In [None]:
X_train_eli5.shape

(2255, 3742)

In [None]:
eli5_model = linear_model.LogisticRegression()
prediction_lr, _ = cv_train_model(X_train_eli5, y_train, X_test_eli5, y_test, model=eli5_model, folds = folds)

Cross Validation mean score: 0.9463, std: 0.0073.
score on test data: 0.8741


### Permutation importance
другой способ посмотреть на важность признаков  - Permutation importance
* Обучаем модель
* Во время валидации смешиваем случайным образом данные в одной из колонок
* Чем сильнее падает метрика, тем важнее признак

см.: https://www.kaggle.com/dansbecker/permutation-importance

## Grid Search

![](https://www.kdnuggets.com/wp-content/uploads/hyper-parameter-search.jpg)

In [None]:
%%time

lr = linear_model.LogisticRegression()

parameter_grid = {'class_weight' : ['balanced', None],
                  'penalty' : ['l2', 'l1'],
                  'solver' : ['liblinear', 'saga'],
                  'C' : [0.001, 0.01, 0.1, 1.0],
                  'max_iter': [2,10,50]
                 }

grid_search = GridSearchCV(lr, param_grid=parameter_grid, cv=folds, n_jobs=-1, scoring='f1_micro')
grid_search.fit(X_train, y_train)
print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

Best score: 0.9534368070953437
Best parameters: {'C': 0.1, 'class_weight': 'balanced', 'max_iter': 10, 'penalty': 'l2', 'solver': 'liblinear'}
CPU times: user 1.39 s, sys: 114 ms, total: 1.5 s
Wall time: 2min 36s


In [None]:
model = linear_model.LogisticRegression(**grid_search.best_params_)
prediction_lr, scores = cv_train_model(X_train, y_train, X_test, y_test,  model=model, folds = stratified_folds)

Cross Validation mean score: 0.9534, std: 0.0111.
score on test data: 0.8907


### примеры с другими моделями

In [25]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()

parameter_grid = {'splitter' : ['best', 'random'],
                  'max_depth' : [2, 3, 4, 5, 10, 20, 30]}

grid_search = GridSearchCV(dtc, param_grid=parameter_grid, cv=folds, scoring='f1_micro', n_jobs=-1)
grid_search.fit(X_train, y_train)
print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))
dtc = DecisionTreeClassifier(**grid_search.best_params_)
prediction_dtc, scores_dtc = cv_train_model(X_train, y_train, X_test, y_test,  model=dtc, folds = stratified_folds)

Best score: 0.6456762749445677
Best parameters: {'max_depth': 30, 'splitter': 'best'}
Cross Validation mean score: 0.6603, std: 0.0126.
score on test data: 0.6522


In [26]:
from sklearn.neighbors import KNeighborsClassifier
knc = KNeighborsClassifier()

parameter_grid = {'n_neighbors': [2, 3, 5, 10, 20, 30],
                  'weights': ['uniform', 'distance']}

grid_search = GridSearchCV(knc, param_grid=parameter_grid, cv=folds, scoring='f1_micro', n_jobs=-1)
grid_search.fit(X_train, y_train)
print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))
knc = KNeighborsClassifier(**grid_search.best_params_)
prediction_knc, scores_knc = cv_train_model(X_train, y_train, X_test, y_test,  model=knc, folds = stratified_folds)

Best score: 0.6753880266075388
Best parameters: {'n_neighbors': 2, 'weights': 'distance'}
Cross Validation mean score: 0.6745, std: 0.0316.
score on test data: 0.5356


In [29]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()

parameter_grid = {'alpha': [1e-06, 1e-04, 0.001, 0.1, 2, 10] }

grid_search = GridSearchCV(bnb, param_grid=parameter_grid, cv=folds, scoring='f1_micro', n_jobs=-1)
grid_search.fit(X_train, y_train)
print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))
mnb = MultinomialNB(**grid_search.best_params_)
prediction_mnb, scores_mnb = cv_train_model(X_train, y_train, X_test, y_test,  model=bnb, folds = stratified_folds)

Best score: 0.9534368070953436
Best parameters: {'alpha': 0.0001}
Cross Validation mean score: 0.9503, std: 0.0120.
score on test data: 0.9094


## Как посмотреть на сами признаки

In [None]:
model.coef_.shape

(4, 8197)

In [None]:
def analyze_features(model, n):
    classes ={}
    for class_index in range(class_number):
        word_weights = [(weight, index_to_word[i]) for i,weight in enumerate(model.coef_[class_index])]
        sorted_word_weights = sorted(word_weights, reverse=True)
        sorted_words = [word for weight,word in sorted_word_weights]
        classes[index_to_class[class_index]] = sorted_words[:n]
    return classes

In [None]:
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
analyze_features(model, 10)

{'rec.sport.baseball': ['fan',
  'jewish',
  'cubs',
  'phillies',
  'stadium',
  'teams',
  'sox',
  'player',
  'pitching',
  'jay'],
 'sci.med': ['effects',
  'cancer',
  'doctor',
  'diagnosed',
  'syndrome',
  'msg',
  'blood',
  'effective',
  'med',
  'physician'],
 'soc.religion.christian': ['heaven',
  'satan',
  'james',
  'clh',
  'romans',
  'understanding',
  'joe',
  'catholic',
  'lord',
  'geneva'],
 'talk.politics.misc': ['clinton',
  'kaldis',
  'address',
  'trial',
  'house',
  'judge',
  'party',
  'tax',
  'optilink',
  'deleted']}