In [1]:
import pandas as pd
import numpy as np

import datetime

from tqdm._tqdm_notebook import tqdm_notebook

tqdm_notebook.pandas()

from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import cross_val_score, TimeSeriesSplit, train_test_split
from sklearn.feature_selection import f_classif, mutual_info_classif

import graphviz

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def fillnan(data, years_c):
    """Convert the listed columns of ``data`` to floats, in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose columns are rewritten; the same object is returned.
    years_c : iterable
        Labels of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, with every cell of the listed columns replaced by
        NaN when it equals the string '..' and by ``float(cell)`` otherwise.
    """
    def _as_float(cell):
        # '..' is replaced by NaN; anything else must be convertible by float()
        if cell == ('..'):
            return np.nan
        return float(cell)

    for column in years_c:
        data[column] = data[column].apply(_as_float)

    return data

def dropempt(data, years_c, thresh=10):
    """Convert the year columns to floats and drop sparsely filled rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to clean; its ``years_c`` columns are converted by ``fillnan``,
        which modifies the frame it is given.
    years_c : iterable
        Labels of the columns handed to ``fillnan``.
    thresh : int, default 10
        Minimum number of non-NaN cells a row needs in order to be kept.
        The count is taken over all columns of the row, not only ``years_c``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` that have at least ``thresh`` non-NaN values.
    """
    data = fillnan(data, years_c)
    return data.dropna(thresh=thresh)

def Clearing(data):
    """Clean ``data`` with ``dropempt`` and print the row count before and after.

    Every column other than 'Country Name', 'Country Code', 'Series Code' and
    'Series Name' is passed to ``dropempt`` as a column to convert.

    Returns the frame produced by ``dropempt``.
    """
    id_columns = ('Country Name', 'Country Code', 'Series Code', 'Series Name')

    years_c = []
    for label in np.array(data.columns):
        if label not in id_columns:
            years_c.append(label)

    rows_before = len(data)
    print ('Before clear:', rows_before)

    data = dropempt(data, years_c)

    rows_after = len(data)
    print ('After clear:', rows_after)
    return data

In [3]:
def Make_region(code, reg='region', by = 'name'):
    """Look up one field of the module-level ``data_cnt`` table.

    Parameters
    ----------
    code
        Value searched for in ``data_cnt[by]``.
    reg : str
        Column of ``data_cnt`` whose value is returned.
    by : str
        Column of ``data_cnt`` that ``code`` is compared against.

    Returns
    -------
    The ``reg`` value at the index label of the first row whose ``by`` value
    equals ``code``, or NaN when ``code`` does not occur in ``data_cnt[by]``.
    """
    keys = data_cnt[by]
    if code not in keys.values:
        return np.nan

    # index label of the first matching row
    first_label = keys[keys == code].index.tolist()[0]
    return data_cnt[reg].at[first_label]

def Sorting(data, reg='region'):
    """Attach a 'Region' column derived from 'Country Name'.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with a 'Country Name' column. When it has no 'Region' column
        yet, one is added to this very object.
    reg : str
        Column of ``data_cnt`` that supplies the region value (forwarded to
        ``Make_region``).

    Returns
    -------
    pandas.DataFrame
        ``data`` unchanged when it already has a 'Region' column. Otherwise
        the result of ``data.reset_index()``, which for an unnamed index adds
        the previous index as an 'index' column.

    Notes
    -----
    Rows whose country is unknown to ``Make_region`` keep NaN in 'Region'.
    The original body contained ``data['Region'].dropna()`` whose result was
    discarded; that statement had no effect and has been removed, so the set
    of returned rows is unchanged.
    """
    if 'Region' in data.columns:
        return data

    # Resolve each distinct country once instead of once per row: every
    # Make_region call scans data_cnt, and the names repeat for every series.
    region_by_country = {name: Make_region(name, reg)
                         for name in data['Country Name'].unique()}
    data['Region'] = data['Country Name'].map(region_by_country)
    return data.reset_index()

In [4]:
# Read the first sheet of the four Excel extracts and stack them into one table.
# pd.concat replaces the chained DataFrame.append calls (append was removed in
# pandas 2.0), and the `encoding` keyword is no longer passed because current
# pandas.read_excel does not accept it.
df = pd.concat([
    pd.read_excel('data/Data_Extract_From_Gender_Statistics.xlsx'),
    pd.read_excel('data/Data_Extract_From_Health_Nutrition_and_Population_Statistics.xlsx'),
    pd.read_excel('data/Data_Extract_From_Millennium_Development_Goals.xlsx'),
    pd.read_excel('data/Data_Extract_From_Health_Nutrition_and_Population_Statistics_by_Wealth_Quintile.xlsx'),
])

# Country lookup table used by Make_region (read as a module-level global).
data_cnt = pd.read_csv('all.csv', encoding='utf8')
df = Clearing(df)

# Second sheet (sheet_name=1) of the first three workbooks.
# NOTE(review): presumably the series definitions - confirm; not used in this section.
defen = pd.concat([
    pd.read_excel('data/Data_Extract_From_Gender_Statistics.xlsx', sheet_name=1),
    pd.read_excel('data/Data_Extract_From_Health_Nutrition_and_Population_Statistics.xlsx', sheet_name=1),
    pd.read_excel('data/Data_Extract_From_Millennium_Development_Goals.xlsx', sheet_name=1),
])

# Sorting adds 'Region' (taken from the 'sub-region' column of data_cnt) and,
# through reset_index(), the 'index' column that is dropped again below.
df = Sorting(df, 'sub-region')
df = df.drop_duplicates()
df = df.drop(columns=['index', '2016 [YR2016]'])

Before clear: 398541
After clear: 104445


HBox(children=(IntProgress(value=0, max=104445), HTML(value='')))




In [5]:
# Labels of every column of df that is not one of the identifier columns below.
years_c = list(filter(
    lambda label: label not in ['Region', 'Country Name', 'Country Code',
                                'Series Code', 'Series Name', 'index'],
    np.array(df.columns),
))

In [6]:
# Cluster the 'Life expectancy at birth, total (years)' rows into 6 groups on
# their year columns and store the cluster id in a new column; rows of every
# other series keep NaN there.
life_rows = df['Series Name'] == 'Life expectancy at birth, total (years)'

# Fixed seed so the cluster ids are the same on every Restart & Run All.
k = KMeans(n_clusters=6, random_state=0)
# NOTE(review): ffill() pads down the rows (from the preceding row of the
# table), not along the years of the same row - confirm this is intended.
k.fit(df.loc[life_rows, years_c].ffill())

df['Life expectancy at birth, total (class)'] = np.nan

# labels_ follows the row order of the fitted frame, which is exactly the rows
# selected by life_rows. One .loc assignment replaces the DataFrame.set_value
# loop (set_value was removed in pandas 1.0).
df.loc[life_rows, 'Life expectancy at birth, total (class)'] = k.labels_

  


In [7]:
def makexy(df, country = 'RUS'):
    """Build the feature table and the target for a single country.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the columns 'Country Code', 'Series Name' and
        'Life expectancy at birth, total (class)'. Every column that is not
        one of the identifier columns listed in the body is treated as a year
        column; its label must start with the year number followed by
        whitespace (the pattern of '2016 [YR2016]').
    country : str
        Value matched against 'Country Code'; also handed to ``Make_region``
        with ``by='alpha-3'``.

    Returns
    -------
    (X, y) : tuple of pandas.DataFrame
        ``X`` has one row per year column and one column per series of the
        country (the three life-expectancy series excluded), plus 'Years',
        'Region_code', 'Reion_sub_code' and 'Country_code'; missing values are
        forward-filled down the rows. ``y`` has one row per year column, each
        holding the class of the country's first
        'Life expectancy at birth, total (years)' row.
        Two empty frames are returned when the country has no such row.

    Raises
    ------
    ValueError
        From ``int()`` when ``Make_region`` returns NaN, i.e. when ``country``
        is not found in ``data_cnt``.
    """
    target_series = 'Life expectancy at birth, total (years)'
    class_column = 'Life expectancy at birth, total (class)'
    non_year_columns = ('Region', 'Country Name', 'Country Code', 'Series Code',
                        'Series Name', 'index', class_column)
    years_c = [label for label in df.columns if label not in non_year_columns]

    # Work on the rows of the requested country only.
    country_rows = df[df['Country Code'] == country].drop_duplicates()

    # .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    classes = country_rows.loc[country_rows['Series Name'] == target_series,
                               class_column].values
    if len(classes) == 0:
        return pd.DataFrame(), pd.DataFrame()

    # One target row per year column (previously a hard-coded 13), so that y
    # always has as many rows as X.
    y = pd.DataFrame({target_series: [classes[0]] * len(years_c)})

    # The life-expectancy series themselves must not be used as features.
    excluded_series = ['Life expectancy at birth, male (years)',
                       'Life expectancy at birth, female (years)',
                       target_series]
    feature_rows = ~country_rows['Series Name'].isin(excluded_series)

    # Transpose: years become rows, series become columns.
    X = country_rows.loc[feature_rows, years_c].T
    X.columns = country_rows.loc[feature_rows, 'Series Name']
    X = X.reset_index(drop=True)

    X['Years'] = [int(name.split()[0]) for name in years_c]
    X['Region_code'] = int(Make_region(country, 'region-code', by='alpha-3'))
    # NOTE(review): 'Reion_sub_code' looks like a typo for 'Region_sub_code';
    # kept as is because the column name ends up in the exported tables.
    X['Reion_sub_code'] = int(Make_region(country, 'sub-region-code', 'alpha-3'))
    X['Country_code'] = int(Make_region(country, 'country-code', 'alpha-3'))
    return X.ffill(), y

def train_set(df, region = 'Eastern Europe'):
    """Stack the ``makexy`` output of every country of ``region``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with 'Region' and 'Country Code' columns, passed on to ``makexy``.
    region : str
        Value matched against the 'Region' column.

    Returns
    -------
    (x, y, columns)
        ``x`` and ``y`` are numpy arrays holding the stacked features and
        targets; ``columns`` is the column index of the stacked feature table.
        All three are empty when no country of the region is processed.
    """
    x_train, y_train = pd.DataFrame(), pd.DataFrame()
    countries = df.loc[df['Region'] == region, 'Country Code'].unique()
    for country in countries:
        # 'FRO' is skipped explicitly; the reason is not visible here.
        # NOTE(review): presumably makexy cannot handle it - confirm.
        if country in ['FRO']:
            continue
        features, target = makexy(df, country)
        # Each new country is placed on top of what has been collected so far.
        # interpolate()/fillna(-999) intentionally stay inside the loop: gaps
        # are filled after every concat, so moving them out of the loop could
        # change the resulting values.
        x_train = pd.concat([features, x_train], axis=0).interpolate().fillna(-999)
        y_train = pd.concat([target, y_train], axis=0).interpolate().fillna(-999)
    # .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    return x_train.values, y_train.values, x_train.columns
    

In [8]:
def Resmaking(df, random_state=None):
    """Score a decision tree per region and export three feature rankings.

    For every non-NaN value of ``df['Region']`` the training set is built with
    ``train_set``, a ``DecisionTreeClassifier`` is cross-validated with an
    8-split ``TimeSeriesSplit`` (mean accuracy is printed) and then fitted on
    the full set. Three workbooks are written per region:

    * 'Importance/Varible_class_in_<region>.xlsx' - non-zero tree feature
      importances, largest first;
    * 'Importance/Varible_f_class_in_<region>.xlsx' - ``f_classif`` results
      whose p-value, rounded to 5 decimals, is below 0.01, largest F first;
    * 'Importance/Varible_muatal_class_in_<region>.xlsx' - non-zero
      ``mutual_info_classif`` scores, largest first.

    The function does not create the 'Importance' directory itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'Region' column, passed on to ``train_set``.
    random_state : int or None, default None
        Seed forwarded to the decision tree and to ``mutual_info_classif``.
        ``None`` keeps the previous unseeded behaviour.
    """
    cv = TimeSeriesSplit(n_splits=8)
    for region in tqdm_notebook(df['Region'].dropna().unique()):
        features, target, feature_names = train_set(df, region)
        labels = target.ravel()

        model = DecisionTreeClassifier(random_state=random_state)
        print (region+' score : ', cross_val_score(model, features, labels, scoring='accuracy',cv=cv).mean())

        model.fit(features, labels)

        tree_rank = pd.DataFrame(model.feature_importances_,
                                 index=feature_names).sort_values(by=[0], ascending=False)
        tree_rank[tree_rank[0]!=0].to_excel('Importance/Varible_class_in_'+str(region)+'.xlsx')

        # f_classif returns (F, p) together; compute it once instead of twice.
        f_values, p_values = f_classif(features, labels)
        anova = pd.DataFrame({'F-value': f_values, 'p-value': p_values},
                             index=feature_names, columns=['F-value', 'p-value'])
        anova['p-value'] = anova['p-value'].round(5)
        anova[anova['p-value']<0.01].sort_values(by=['F-value'], ascending=False).to_excel('Importance/Varible_f_class_in_'+str(region)+'.xlsx')

        mutual_rank = pd.DataFrame(mutual_info_classif(features, labels, random_state=random_state),
                                   index=feature_names).sort_values(by=[0], ascending=False)
        mutual_rank[mutual_rank[0]!=0].to_excel('Importance/Varible_muatal_class_in_'+str(region)+'.xlsx')
        
        

In [9]:
# Evaluate every region found in df['Region']; prints one accuracy score per
# region and writes the feature-ranking workbooks to Importance/*.xlsx.
Resmaking(df)

HBox(children=(IntProgress(value=0, max=22), HTML(value='')))

Southern Asia score :  0.9772727272727273


  f = msb / msw


Northern Africa score :  0.8571428571428571


  f = msb / msw


Middle Africa score :  0.625


  f = msb / msw


South America score :  0.8035714285714286


  f = msb / msw


Western Asia score :  0.7445652173913044


  f = msb / msw


Australia and New Zealand score :  1.0


  msb = ssbn / float(dfbn)


Caribbean score :  0.675


  f = msb / msw


Eastern Europe score :  0.8181818181818181


  f = msb / msw


Central America score :  0.8295454545454546


  f = msb / msw


Western Africa score :  0.5499999999999999


  f = msb / msw


Southern Africa score :  0.7678571428571428


  f = msb / msw


Eastern Africa score :  0.609375


  f = msb / msw


South-Eastern Asia score :  0.40384615384615385


  f = msb / msw


Southern Europe score :  0.6764705882352942


  f = msb / msw


Melanesia score :  0.6785714285714286


  f = msb / msw
  f = msb / msw


Western Europe score :  1.0


  msb = ssbn / float(dfbn)


Northern Europe score :  0.875


  f = msb / msw


Central Asia score :  1.0


  msb = ssbn / float(dfbn)


Eastern Asia score :  0.71875


  f = msb / msw
  f = msb / msw


Northern America score :  0.90625


  f = msb / msw


Polynesia score :  1.0


  msb = ssbn / float(dfbn)


Micronesia score :  0.6875


  f = msb / msw
  f = msb / msw





In [10]:
def Resmaking_for_contr(df, country, random_state=None):
    """Score a decision tree for one country and export three feature rankings.

    The features and target come from ``makexy(df, country)`` with NaN
    replaced by ``np.nan_to_num``. A ``DecisionTreeClassifier`` is
    cross-validated with a 4-split ``TimeSeriesSplit`` (mean accuracy is
    printed) and then fitted on the full set. Three workbooks are written:

    * 'Importance/Varible_class_in_<country>.xlsx' - non-zero tree feature
      importances, largest first;
    * 'Importance/Varible_f_class_in_<country>.xlsx' - ``f_classif`` results
      whose p-value, rounded to 5 decimals, is below 0.01, largest F first;
    * 'Importance/Varible_muatal_class_in_<country>.xlsx' - non-zero
      ``mutual_info_classif`` scores, largest first.

    The function does not create the 'Importance' directory itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Table passed on to ``makexy``.
    country : str
        Country code passed on to ``makexy``; also used in the file names.
    random_state : int or None, default None
        Seed forwarded to the decision tree and to ``mutual_info_classif``.
        ``None`` keeps the previous unseeded behaviour.

    Raises
    ------
    ValueError
        When ``makexy`` returns no data for ``country``.
    """
    features_df, target_df = makexy(df, country)
    if features_df.empty or target_df.empty:
        # makexy signals "no life-expectancy row" with empty frames; fail with
        # a clear message instead of an obscure error from scikit-learn.
        raise ValueError('no training data available for country ' + str(country))

    feature_names = features_df.columns
    # .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    features = np.nan_to_num(features_df.values)
    labels = np.nan_to_num(target_df.values).ravel()
    cv = TimeSeriesSplit(n_splits=4)

    model = DecisionTreeClassifier(random_state=random_state)
    print (country+' score : ', cross_val_score(model, features, labels, scoring='accuracy',cv=cv).mean())

    model.fit(features, labels)
    tree_rank = pd.DataFrame(model.feature_importances_,
                             index=feature_names).sort_values(by=[0], ascending=False)
    tree_rank[tree_rank[0]!=0].to_excel('Importance/Varible_class_in_'+str(country)+'.xlsx')

    # f_classif returns (F, p) together; compute it once instead of twice.
    f_values, p_values = f_classif(features, labels)
    anova = pd.DataFrame({'F-value': f_values, 'p-value': p_values},
                         index=feature_names, columns=['F-value', 'p-value'])
    anova['p-value'] = anova['p-value'].round(5)
    anova[anova['p-value']<0.01].sort_values(by=['F-value'], ascending=False).to_excel('Importance/Varible_f_class_in_'+str(country)+'.xlsx')

    mutual_rank = pd.DataFrame(mutual_info_classif(features, labels, random_state=random_state),
                               index=feature_names).sort_values(by=[0], ascending=False)
    mutual_rank[mutual_rank[0]!=0].to_excel('Importance/Varible_muatal_class_in_'+str(country)+'.xlsx')

In [11]:
# Same analysis for the single country whose 'Country Code' is 'RUS'.
Resmaking_for_contr(df, 'RUS')

RUS score :  1.0


  msb = ssbn / float(dfbn)
