In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.metrics import classification_report


# Relative location of the workshop data directory.
DATA_FOLDER = os.path.join('..', '..', '..', 'data', 'workshop_1')


def resolve_path(*parts):
    """Return the path formed by joining ``parts`` onto ``DATA_FOLDER``.

    With no arguments the data folder itself is returned.
    """
    segments = (DATA_FOLDER,) + parts
    return os.path.join(*segments)

In [78]:
# Load the full problems table. The first CSV column (`id`) becomes the index.
# `pd.read_csv(..., index_col=0)` replaces `DataFrame.from_csv`, which has been
# deprecated and removed from pandas; the index holds ids, so no date parsing
# of the index is requested.
df_s = pd.read_csv(resolve_path('csv', 'problems_source.csv'), index_col=0)

# Working set = a reproducible 5% sample (drawn with replacement, fixed seed)
# plus every row whose status is 'Не удовлетворён'.
# `pd.concat` replaces `DataFrame.append`, which no longer exists in pandas 2.x.
# NOTE(review): sampling with replacement and then adding the extra rows can
# yield duplicated rows in `df` — confirm this is intended.
df = pd.concat([
    df_s.sample(n=int(len(df_s) * .05), replace=True, random_state=42),
    df_s[df_s['status'] == 'Не удовлетворён'],
])

In [79]:
# Preview the first rows of the working sample before any encoding is applied.
df.head()

Unnamed: 0_level_0,address,building,category,city_object,district,is_public,latitude,longitude,municipality,reason,status,type_of,url
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
699906,"город Пушкин, Конюшенная улица, дом 37, литера А",92194.0,19,1,35,True,59.720988,30.415632,160.0,210,Получен ответ,building,/facilities/92194/problems/
149846,"г.Санкт-Петербург, проспект Героев, дом 24, ко...",139731.0,8,1,16,False,59.861226,30.166544,111.0,26,Отклонено,building,
689932,"г.Санкт-Петербург, Крюкова улица, дом 11, лите...",76648.0,10,1,11,False,59.961808,30.417959,105.0,35,Получен ответ,building,
455840,"г.Санкт-Петербург, набережная реки Фонтанки, д...",84097.0,10,1,32,False,59.916693,30.293292,66.0,119,Получен ответ,building,
562500,"г.Санкт-Петербург, набережная Крюкова канала, ...",57528.0,10,1,32,True,59.918182,30.300352,66.0,189,Получен ответ,building,/facilities/57528/problems/


In [80]:

def transform(df):
    """Encode the categorical columns as integer codes and fill missing values.

    The input frame is copied first, so the caller's DataFrame is never
    modified; only the returned frame carries the encoded columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the ``is_public`` and ``status`` columns.
        ``building`` and ``municipality`` are filled when present.

    Returns
    -------
    tuple of (pandas.DataFrame, dict)
        The encoded copy of ``df`` and a mapping from each integer status code
        to the original ``status`` value, for decoding predictions later.
    """
    def fill_none(df, def_value=0, columns=('building', 'municipality')):
        """Return ``df`` with missing values in ``columns`` replaced by ``def_value``."""
        # An immutable tuple default avoids sharing a mutable list across calls.
        return df.fillna({name: def_value for name in columns})

    categorical = ('is_public', 'status')

    # Copy up front: the column assignments below would otherwise write
    # straight into the frame owned by the caller.
    encoded = df.copy()
    for c in categorical:
        encoded[c] = encoded[c].astype('category')

    # Capture the code -> label mapping before the labels are replaced by codes.
    status = dict(enumerate(encoded['status'].cat.categories))

    for c in categorical:
        encoded[c] = encoded[c].cat.codes

    return fill_none(encoded), status

# Swap the working frame for its encoded version and keep the code -> label
# mapping. NOTE: this cell rebinds `df`, so running it a second time without
# re-running the load cell encodes already-encoded data and `status` would
# then map codes to codes instead of labels.
df, status = transform(df)


In [109]:
# Columns fed to the model; `status` (encoded) is the prediction target.
features = [
    'is_public',
    'building',
    'category',
    'city_object',
    'district',
    'municipality',
    'reason',
    'latitude',
    'longitude',
]

X = df[features]
y = df['status']

# Hold out 30% of the rows for evaluation, with a fixed seed for repeatability.
# NOTE(review): `df` was built by sampling with replacement, so identical rows
# may land on both sides of this split — confirm that is acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.3,
    random_state=42,
)


In [110]:
# Preview the feature matrix: only the selected model inputs, already encoded.
X.head()

Unnamed: 0_level_0,is_public,building,category,city_object,district,municipality,reason,latitude,longitude
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
699906,1,92194.0,19,1,35,160.0,210,59.720988,30.415632
149846,0,139731.0,8,1,16,111.0,26,59.861226,30.166544
689932,0,76648.0,10,1,11,105.0,35,59.961808,30.417959
455840,0,84097.0,10,1,32,66.0,119,59.916693,30.293292
562500,1,57528.0,10,1,32,66.0,189,59.918182,30.300352


In [111]:
# Hyper-parameter grid: 2 forest sizes x 2 depth limits = 4 candidates.
param_grid = dict(
    n_estimators=[10, 100],
    max_depth=[10, 100],
)

# 5-fold cross-validated search over an entropy-based random forest.
# NOTE(review): both the forest and the search ask for all cores (n_jobs=-1);
# confirm this nesting does not oversubscribe the machine.
CV_rfc = GridSearchCV(
    estimator=RandomForestClassifier(
        criterion='entropy',
        random_state=42,
        n_jobs=-1,
    ),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)


In [112]:
%time
CV_rfc.fit(X_train,y_train)

Wall time: 0 ns




GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [10, 100], 'max_depth': [10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [96]:
# Show the integer code -> status label mapping returned by `transform`.
status

{0: 'Модерация',
 1: 'Не удовлетворён',
 2: 'Отклонено',
 3: 'Получен ответ',
 4: 'Промежуточный ответ',
 5: 'Рассмотрение'}

In [113]:
# Per-class precision / recall / F1 of the best estimator on the held-out
# split. Rows are labelled with the integer status codes; see `status` for
# the corresponding labels.
print(
    classification_report(
        y_test,
        CV_rfc.best_estimator_.predict(X_test),
    )
)

             precision    recall  f1-score   support

          0       0.00      0.00      0.00        19
          2       0.67      0.38      0.48      1628
          3       0.82      0.97      0.89      9586
          4       0.55      0.07      0.12       718
          5       0.67      0.02      0.03       350

avg / total       0.78      0.81      0.77     12301



  'precision', 'predicted', average, warn_for)


In [114]:
# Pair each importance of the best estimator with the feature column it
# belongs to, as (importance, column name) tuples.
[
    (importance, column)
    for importance, column in zip(
        CV_rfc.best_estimator_.feature_importances_.tolist(),
        X.columns,
    )
]

[(0.25989751382712345, 'is_public'),
 (0.0753534055083335, 'building'),
 (0.11844441449514796, 'category'),
 (0.07861404739423979, 'city_object'),
 (0.03669725033608562, 'district'),
 (0.06778650335529969, 'municipality'),
 (0.19418320645277973, 'reason'),
 (0.08682835935856109, 'latitude'),
 (0.08219529927242927, 'longitude')]

------

In [94]:
from ipywidgets import interact
from IPython.display import display
import ipywidgets as widgets
import json


In [98]:
# A single problem record in the JSON shape used for ad-hoc predictions.
val='''{
   "id":833143,
   "is_public":1,
   "status_name":"Рассмотрение",
   "latitude":"59.9727955",
   "longitude":"30.3402690",
   "full_address":"г.Санкт-Петербург, Большой Сампсониевский проспект, дом 59, литера А",
   "building":19485,
   "district":36,
   "municipality":14,
   "reason":210,
   "category":19,
   "city_object":1
}'''

# Build the one-row frame directly from the parsed record; `DataFrame.append`
# no longer exists in pandas 2.x.
new = pd.DataFrame([json.loads(val)])

# The JSON carries the coordinates as strings; cast them so the model
# receives numeric features rather than text.
new[['latitude', 'longitude']] = new[['latitude', 'longitude']].astype(float)

In [103]:
# Predicted status code for the single new record; look the code up in
# `status` to get the label.
CV_rfc.best_estimator_.predict(new[features]).tolist()

[3]

In [106]:
# Class probabilities for the new record as (probability, status label) tuples.
# The columns of `predict_proba` follow the estimator's `classes_`, i.e. only
# the codes seen during fitting. Pairing through `classes_` keeps each
# probability with its own label even if some status code never reached the
# training split; zipping against `status.values()` would shift the labels
# in that case.
[
    (probability, status[int(class_code)])
    for class_code, probability in zip(
        CV_rfc.best_estimator_.classes_,
        CV_rfc.best_estimator_.predict_proba(new[features])[0].tolist(),
    )
]

[(0.0, 'Модерация'),
 (0.0, 'Не удовлетворён'),
 (0.01552040808700545, 'Отклонено'),
 (0.7858753072512391, 'Получен ответ'),
 (0.1602507492580307, 'Промежуточный ответ'),
 (0.038353535403724666, 'Рассмотрение')]