Перед вами датасет с показателями поведения абонентов телеком оператора, 
вам предстоит решить задачу классификации и предсказать возрастную группу абонента(в train датасете столбец year_group, это ваш таргет). 
Ответ просьба предоставить в виде test датасета в изначальном виде с добавленным столбцом prediction.
Также просьба предоставить сам код с детальными комментариями хода решения, постарайтесь достичь предельной точности(метрика точности accuracy).
Успехов.

In [1]:
import pandas as pd
import matplotlib as plt
import numpy as np
import pickle
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier, SGDRegressor
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
%matplotlib inline



### Tasks: 
1) Отмаштабируем все признаки   
2) Будем понижать к-во признаков   
3) У нас выборка не сбалансирована, будем это учитывать  
4) Есть категориальные признаки  
5) Подберем модель подходящюю под данную задачу  

In [2]:
data = pd.read_csv('./train.csv')
data_test = pd.read_csv('./test.csv')

In [3]:
data.shape

(77252, 488)

In [6]:
data.head()

Unnamed: 0,unique_num,2,3,4,5,6,7,8,9,10,...,478,479,480,481,482,483,484,485,486,487
0,2,2,1,21058,3545,0,13,0.0,0,0,...,1,261tt1,2,6,2549anr12,1,41,91,31,5
1,3,1,1,21058,4222,283,3,0.0,0,0,...,0,328tt1,2,14,3030anr12,0,71,71,31,5
2,6,1,1,21058,4142,1808,31,0.0,0,0,...,0,282tt1,1,14,1633anr12,0,81,81,31,5
3,7,1,0,21058,4124,2185,14,0.0,0,0,...,0,246tt1,2,14,3603anr12,0,38,38,31,5
4,8,1,0,21058,4202,461,21058,0.0,0,0,...,0,246tt1,0,14,1627anr12,0,2,2,31,5


In [7]:
count = data.dtypes.value_counts() 
for i in count.index: 
    print('The number of ', i, 'objects is ', count[i])

('The number of ', dtype('int64'), 'objects is ', 289)
('The number of ', dtype('float64'), 'objects is ', 195)
('The number of ', dtype('O'), 'objects is ', 3)


In [8]:
categorical_columns = data.loc[:, data.dtypes == object].columns

####  Котегориальные фичи с большим к-вом разных значений выбросим, остальные в OneHotVectors

In [9]:
[len(pd.unique(data[col])) for col in data.loc[:, data.dtypes == object].columns]

[218, 353, 3651]

In [10]:
data = data.drop([categorical_columns[2]], axis=1)

In [11]:
categorical_columns = categorical_columns[:2]

In [12]:
label_encoder = LabelEncoder()
for column in categorical_columns:
    data[column] = label_encoder.fit_transform(data[column])
data.head()

Unnamed: 0,unique_num,2,3,4,5,6,7,8,9,10,...,477,478,479,480,481,483,484,485,486,487
0,2,2,1,21058,3545,0,13,0.0,0,0,...,18,1,179,2,6,1,41,91,31,5
1,3,1,1,21058,4222,283,3,0.0,0,0,...,93,0,253,2,14,0,71,71,31,5
2,6,1,1,21058,4142,1808,31,0.0,0,0,...,104,0,202,1,14,0,81,81,31,5
3,7,1,0,21058,4124,2185,14,0.0,0,0,...,93,0,162,2,14,0,38,38,31,5
4,8,1,0,21058,4202,461,21058,0.0,0,0,...,93,0,162,0,14,0,2,2,31,5


In [13]:
onehot_encoder = OneHotEncoder(sparse=False)
encoded_categorical_columns = pd.DataFrame(onehot_encoder.fit_transform(data[categorical_columns]), dtype='int8')
encoded_categorical_columns.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,561,562,563,564,565,566,567,568,569,570
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
encoded_categorical_columns.to_csv('encoded_categorical_columns')

In [36]:
pd.unique(data['year_group'])

#### Имее дело с несбалансированными класами: к-во в первом классе значительно отличается от остальных. 

In [None]:
data.groupby(by='year_group').size().plot.bar()

In [None]:
data = data.drop(categorical_columns, axis=1)

#### Произведем:
- масштабирование и стратификацию нашей выборки

In [18]:
scaler = StandardScaler()
scaler.fit(data, y)
X_scaled  = scaler.transform(data)

#### Выбросим неинформативные признаки

In [26]:
estimator = RandomForestClassifier(class_weight='balanced', n_estimators=100)

In [27]:
estimator.fit(X_scaled, y)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [22]:
not_informative_features = ['{}'.format(i) for i, x in enumerate(estimator.feature_importances_) if x < 1e-04]

In [36]:
np.save('not_informative_features', not_informative_features)

In [30]:
informative_features = list(set(data.columns) - set(not_informative_features))

In [32]:
X_informative = data[informative_features]

In [39]:
X_informative.head()

Unnamed: 0,212,213,210,211,42,452,265,218,219,133,...,356,470,260,353,352,351,350,267,478,125
0,0,58,6665,16521,0.0,0.0,0.0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,95.657895,1,0
1,4,21,2755,3540,0.0,0.0,0.0,0,0,0.0,...,0.0,0.2431,0.0,0.0,0.0,0.0,0.0,59.545455,0,0
2,1,9,1078,6000,0.0,0.0,0.0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,110.315789,0,0
3,0,32,4154,1800,0.0,0.0,0.0,0,0,0.0,...,0.0,0.0,0.0,1.6862,1.6862,0.0,1.1764,56.653846,0,0
4,4,39,9183,19680,0.0,0.013013,0.0,0,0,0.0,...,0.0,0.0,0.0,0.0,3.9215,0.0,0.0,332.0625,0,0


In [40]:
encoded_categorical_columns.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,561,562,563,564,565,566,567,568,569,570
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
data_with_one_hot = pd.concat([X_informative, encoded_categorical_columns], axis=1)

In [28]:
data_with_one_hot.to_csv('data_with_one_hot.csv')

#### Разделим выборку на train и test

In [None]:
X = data_with_one_hot.drop(['year_group'], axis=1)
y = data_with_one_hot['year_group']

In [30]:
(X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)

#### Сгенерируем новых фич
- Из распределения классов я предположил, что возрастные группы упорядочены, то есть 0<1…<6 или наоборот. А раз так, то можно не классифицировать, а строить регрессию. Она будет работать плохо, но зато её результат можно передать другим алгоритмам для обучения.  
-  Если в данных есть реальная структура (а в данных по абонентам она должна быть), то k-средних её почувствует. 

In [33]:
sgd = SGDRegressor(loss='huber', n_iter=100)
sgd.fit(X_train, y_train)
test  = np.hstack((data_test_with_one_hot, sgd.predict(data_test_with_one_hot)[None].T))
train = np.hstack((X_train, sgd.predict(X_train)[None].T))

In [47]:
k15 = KMeans(n_clusters=15, precompute_distances = True, n_jobs=-1)
k15.fit(train)
k7 = KMeans(n_clusters=7, precompute_distances = True, n_jobs=-1)
k7.fit(train)
k3 = KMeans(n_clusters=3, precompute_distances = True, n_jobs=-1)
k3.fit(train)
test  = np.hstack((test,  k15.predict(test)[None].T,  k7.predict(test)[None].T,  k3.predict(test)[None].T))
train = np.hstack((train, k15.predict(train)[None].T, k7.predict(train)[None].T, k3.predict(train)[None].T))

In [52]:
np.save('train.csv', train)

In [51]:
np.save('test.csv', test)

#### Используем ЛУЧШИЙ алгоритм

In [61]:
gbm = xgb.XGBClassifier(silent=False, nthread=4, max_depth=10, n_estimators=800, subsample=0.5, learning_rate=0.03, seed=1337)
gbm.fit(train, y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.03, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=800, nthread=4,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=1337, silent=False, subsample=0.5)

In [64]:
pickle.dump(gbm,  open('xgboost_model.sav', 'wb'))

In [62]:
bst = gbm.booster()
imps = bst.get_fscore()

#### Проверим результаты на тесте и тренировке

In [33]:
y_predicted_train = optimizer.best_estimator_.predict(X_train)

In [66]:
y_predicted = gbm.predict(test)

In [35]:
accuracy_score(y_train, y_predicted_train)

1.0

In [67]:
accuracy_score(y_test, y_predicted)

0.49965481532619954

In [68]:
print(classification_report(y_test, y_predicted))

             precision    recall  f1-score   support

          1       0.54      0.25      0.35      1289
          2       0.57      0.73      0.64      5594
          3       0.44      0.36      0.39      5143
          4       0.43      0.45      0.44      5766
          5       0.54      0.51      0.53      5384

avg / total       0.50      0.50      0.49     23176



In [70]:
confusion_matrix(y_test, y_predicted)

array([[ 326,  693,  123,   73,   74],
       [ 125, 4072,  728,  404,  265],
       [  50, 1245, 1832, 1485,  531],
       [  30,  550, 1131, 2596, 1459],
       [  68,  639,  395, 1528, 2754]])