In [1]:
import math
import warnings
from imp import IMP_HOOK

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import xgboost as xgb
from imblearn.over_sampling import (ADASYN, SMOTE, SMOTEN, SMOTENC, SVMSMOTE,
                                    BorderlineSMOTE, KMeansSMOTE,
                                    RandomOverSampler)
from imblearn.pipeline import Pipeline as Pipeline_Imb
from imblearn.under_sampling import (AllKNN, CondensedNearestNeighbour,
                                     EditedNearestNeighbours, NearMiss,
                                     OneSidedSelection, RandomUnderSampler,
                                     RepeatedEditedNearestNeighbours,
                                     TomekLinks)
from lightgbm import LGBMRegressor
from sklearn import metrics
# imports
from sklearn.datasets import load_digits
from sklearn.decomposition import (PCA, FactorAnalysis, FastICA,
                                   IncrementalPCA, KernelPCA)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import (AdaBoostClassifier, AdaBoostRegressor,
                              BaggingRegressor, ExtraTreesRegressor,
                              GradientBoostingClassifier,
                              GradientBoostingRegressor,
                              RandomForestClassifier, RandomForestRegressor)
from sklearn.feature_selection import (RFE, RFECV, SelectFromModel,
                                       SelectKBest, SelectPercentile,
                                       SequentialFeatureSelector, chi2,
                                       f_classif, f_oneway, f_regression,
                                       mutual_info_classif,
                                       mutual_info_regression)
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import (accuracy_score, mean_absolute_error,
                             mean_squared_error, pairwise_distances, r2_score)
from sklearn.model_selection import (GridSearchCV, GroupShuffleSplit, KFold,
                                     RepeatedKFold, RepeatedStratifiedKFold,
                                     cross_val_score, cross_validate,
                                     train_test_split)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import (LabelEncoder, MinMaxScaler, OneHotEncoder,
                                   StandardScaler)
from sklearn.svm import SVC
from sklearn.tree import (DecisionTreeClassifier, DecisionTreeRegressor,
                          ExtraTreeClassifier, ExtraTreeRegressor)
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools import add_constant
from statsmodels.tools.tools import add_constant
from sympy import im
from utils_metrics import utils_Imb_binary_metrics
from xgboost import XGBRegressor

warnings.filterwarnings("ignore")


***
## Start Section of API's defined for project
***

In [2]:
def LabelEncoder_Local(data, category_columns):
    """For a feature, get unique classes, replace classes with their total count in feaure
       Since numeric feaures are binary, replace these numeric counts with MinMaxScalar (0-1 range)

    Args:
        data ([dataframe]): [dataframe containing categorical columns]
        category_columns ([array-like, str]): [categorical column names]
    
    Returns:
        new dataframe with LabelEncodeds
    """
       
    for col in category_columns:
        data[col] = data[col].map(data[col].value_counts().to_dict())
       
    
    scalar = MinMaxScaler()
    data.loc[:, category_columns] = scalar.fit_transform( data.loc[:, category_columns])       
        
    return data

In [3]:
def get_imp_features_tree_regresors(X_df,y):
    pipeline = Pipeline(
                    [('feat_select', SelectKBest()),
                    ('lgbm', LGBMRegressor())
                        
    ])

    parameters = {}
    parameters['feat_select__k'] = [50, 75, 100, 125, 150, 175, 200]
    parameters['feat_select__score_func'] = [mutual_info_regression, f_regression]


    CV = GridSearchCV(pipeline, parameters, scoring = ['neg_mean_absolute_error', 'r2'], refit='r2' ,n_jobs= 1)
    CV.fit(X_df.values, y)  

    # print('Best score and parameter combination = ')

    # print(CV.best_score_)    
    # print(CV.best_params_) 
    sel = SelectKBest(CV.best_params_['feat_select__score_func'], k=CV.best_params_['feat_select__k']).fit(X_df, y)
    best_features = X_df.columns.values[sel.get_support()]
    return best_features

    # y_pred = CV.predict(Xtest_1.values)
    # print('MAE on validation set: %s' % (round(mean_absolute_error(ytest_1, y_pred), 5)))
    # print('r2_score on validation set: ', r2_score(ytest_1, y_pred))

In [4]:
def get_imp_features_pairwise_dist(X_df, y):
    # find similarity amongst features....we take lesser values features which represent variation amongst features
    feature_similarity = pairwise_distances(X_df.T, metric = 'cosine')
    feature_similarity = pd.DataFrame(feature_similarity, columns=X_df.T.index, index=X_df.T.index)
    feature_similarity < 0.8
    df_pairwise = feature_similarity[feature_similarity < 0.8].dropna(axis=0, thresh=int(feature_similarity.shape[0]/3))
    df_pairwise = df_pairwise.dropna(axis=1, thresh=15)
    df_pairwise = df_pairwise[df_pairwise < 0.5].dropna(axis=1, how='all')
    best_features = np.array(list(set(df_pairwise.columns.values).union(df_pairwise.index.values)), dtype='object')
    return best_features
    

In [5]:
def get_imp_features_RandomForest(X_df, y):
    # feature selection using RandomForest
    from sklearn.ensemble import RandomForestRegressor
    model = RandomForestRegressor(random_state=1, max_depth=3)
    model.fit(X_df.values,y)
    importances = model.feature_importances_
    indices = np.argsort(importances)[-30:]
    best_features = [X_df.columns.values[i] for i in indices] 
    return np.array(best_features, dtype='object')

In [6]:
def get_imp_features_RFE(X_df, y):
    lreg = LGBMRegressor()
    rfe = RFE(lreg, 50)
    rfe.fit(X_df.values, y)
    return X_df.columns.values[rfe.get_support()]


In [7]:
def get_imp_features_PCA(X_df, y):
    pca = PCA(2)
    pca.fit(X_df)
    
    # take top 10 features from PCA1 and PCA2
    PC1 = pca.components_[0]
    idx = np.argsort(PC1)
    top_features_PC1 = X_df.columns.values[idx][::-1][:99]
    
    PC2 = pca.components_[1]
    idx = np.argsort(PC2)
    top_features_PC2 = X_df.columns.values[idx][::-1][:99]
    
    return np.array(list((set(top_features_PC1).union(top_features_PC2))))
    

***
## End Section of API's defined for project
***

In [8]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

print( 'no of columns train:', len(train.columns), '\nno of columns in test:', len(test.columns))
train.head(20)

no of columns train: 378 
no of columns in test: 377


Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0
5,18,92.93,t,b,e,c,d,g,h,s,...,0,0,1,0,0,0,0,0,0,0
6,24,128.76,al,r,e,f,d,f,h,s,...,0,0,0,0,0,0,0,0,0,0
7,25,91.91,o,l,as,f,d,f,j,a,...,0,0,0,0,0,0,0,0,0,0
8,27,108.67,w,s,as,e,d,f,i,h,...,1,0,0,0,0,0,0,0,0,0
9,30,126.99,j,b,aq,c,d,f,a,e,...,0,0,1,0,0,0,0,0,0,0


In [9]:
test.head(20)

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0
2,3,az,v,as,f,d,a,j,j,0,...,0,0,0,1,0,0,0,0,0,0
3,4,az,l,n,f,d,z,l,n,0,...,0,0,0,1,0,0,0,0,0,0
4,5,w,s,as,c,d,y,i,m,0,...,1,0,0,0,0,0,0,0,0,0
5,8,y,aa,ai,e,d,x,g,s,0,...,1,0,0,0,0,0,0,0,0,0
6,10,x,b,ae,d,d,x,d,y,0,...,0,0,0,0,0,1,0,0,0,0
7,11,f,s,ae,c,d,h,d,a,0,...,0,0,1,0,0,0,0,0,0,0
8,12,ap,l,s,c,d,h,j,n,0,...,0,0,0,0,0,0,0,0,0,0
9,14,o,v,as,f,d,g,f,v,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
train_categorical = train.select_dtypes('O')
train_numeric = train.select_dtypes('number')
test_categorical = test.select_dtypes('O')
test_numeric = test.select_dtypes('number')

In [11]:
print(train_categorical.columns.difference(test_categorical.columns))
print(train_numeric.columns.difference(test_numeric.columns))

Index([], dtype='object')
Index(['y'], dtype='object')


In [12]:
# column y is not present in test data and should be the target (as also the name suggests)
train_numeric['y'].isna().sum()

0

***
## Check for null and unique values for test and train sets
***

In [13]:
# numerical value insight
train_numerical_insight = pd.concat([train_numeric.isna().sum(), train_numeric.dtypes], axis=1)
train_numerical_insight.columns = ['Nan', 'Obj Type']
train_numerical_insight = pd.concat([train_numerical_insight, train_numeric.describe().T,
                                     pd.Series(train_numeric.apply(lambda x: pd.value_counts(x).to_dict()), name='unique_valcount'),
                                     pd.Series(train_numeric.apply(np.unique), name='unique_val')], axis=1)
train_numerical_insight.head(20)

Unnamed: 0,Nan,Obj Type,count,mean,std,min,25%,50%,75%,max,unique_valcount,unique_val
ID,0,int64,4209.0,4205.960798,2437.608688,0.0,2095.0,4220.0,6314.0,8417.0,"{0: 1, 5600: 1, 5604: 1, 5606: 1, 5611: 1, 561...","[0, 6, 7, 9, 13, 18, 24, 25, 27, 30, 31, 32, 3..."
y,0,float64,4209.0,100.669318,12.679381,72.11,90.82,99.15,109.01,265.32,"{91.88: 7, 89.38: 7, 89.06: 7, 90.76: 7, 89.19...","[72.11, 72.5, 72.94, 73.02, 73.15, 73.24, 73.2..."
X10,0,int64,4209.0,0.013305,0.11459,0.0,0.0,0.0,0.0,1.0,"{0: 4153, 1: 56}","[0, 1]"
X11,0,int64,4209.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,{0: 4209},[0]
X12,0,int64,4209.0,0.075077,0.263547,0.0,0.0,0.0,0.0,1.0,"{0: 3893, 1: 316}","[0, 1]"
X13,0,int64,4209.0,0.057971,0.233716,0.0,0.0,0.0,0.0,1.0,"{0: 3965, 1: 244}","[0, 1]"
X14,0,int64,4209.0,0.42813,0.494867,0.0,0.0,0.0,1.0,1.0,"{0: 2407, 1: 1802}","[0, 1]"
X15,0,int64,4209.0,0.000475,0.021796,0.0,0.0,0.0,0.0,1.0,"{0: 4207, 1: 2}","[0, 1]"
X16,0,int64,4209.0,0.002613,0.051061,0.0,0.0,0.0,0.0,1.0,"{0: 4198, 1: 11}","[0, 1]"
X17,0,int64,4209.0,0.007603,0.086872,0.0,0.0,0.0,0.0,1.0,"{0: 4177, 1: 32}","[0, 1]"


In [14]:
train_numerical_insight.Nan.sum()

0

In [15]:
# categorical train insights
train_categorical_insight = pd.DataFrame({'Nan': train_categorical.isna().sum(),
                                         'Obj Type': train_categorical.dtypes})
train_categorical_insight = pd.concat([train_categorical_insight,
                                       train_categorical.describe().T,
                                       pd.Series(train_categorical.apply(np.unique), name='unique_val')], axis=1)
train_categorical_insight.head(20)

Unnamed: 0,Nan,Obj Type,count,unique,top,freq,unique_val
X0,0,object,4209,47,z,360,"[a, aa, ab, ac, ad, af, ai, aj, ak, al, am, ao..."
X1,0,object,4209,27,aa,833,"[a, aa, ab, b, c, d, e, f, g, h, i, j, k, l, m..."
X2,0,object,4209,44,as,1659,"[a, aa, ac, ae, af, ag, ah, ai, ak, al, am, an..."
X3,0,object,4209,7,c,1942,"[a, b, c, d, e, f, g]"
X4,0,object,4209,4,d,4205,"[a, b, c, d]"
X5,0,object,4209,29,w,231,"[aa, ab, ac, ad, ae, af, ag, ah, c, d, f, g, h..."
X6,0,object,4209,12,g,1042,"[a, b, c, d, e, f, g, h, i, j, k, l]"
X8,0,object,4209,25,j,277,"[a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, ..."


In [16]:
# numerical test insight
test_numeric_insight = pd.concat([test_numeric.isna().sum(), test_numeric.dtypes], axis=1)
test_numeric_insight.columns = ['Nan', 'Obj Type']
test_numeric_insight = pd.concat([test_numeric_insight, 
                                  test_numeric.describe().T, 
                                  pd.Series(test_numeric.apply(lambda x: pd.value_counts(x).to_dict()), name='unique_valcount'),
                                  pd.Series(test_numeric.apply(np.unique), name='unique_val')], axis=1)
test_numeric_insight.head(20)

Unnamed: 0,Nan,Obj Type,count,mean,std,min,25%,50%,75%,max,unique_valcount,unique_val
ID,0,int64,4209.0,4211.039202,2423.078926,1.0,2115.0,4202.0,6310.0,8416.0,"{1: 1, 5592: 1, 5594: 1, 5595: 1, 5596: 1, 559...","[1, 2, 3, 4, 5, 8, 10, 11, 12, 14, 15, 16, 17,..."
X10,0,int64,4209.0,0.019007,0.136565,0.0,0.0,0.0,0.0,1.0,"{0: 4129, 1: 80}","[0, 1]"
X11,0,int64,4209.0,0.000238,0.015414,0.0,0.0,0.0,0.0,1.0,"{0: 4208, 1: 1}","[0, 1]"
X12,0,int64,4209.0,0.074364,0.262394,0.0,0.0,0.0,0.0,1.0,"{0: 3896, 1: 313}","[0, 1]"
X13,0,int64,4209.0,0.06106,0.239468,0.0,0.0,0.0,0.0,1.0,"{0: 3952, 1: 257}","[0, 1]"
X14,0,int64,4209.0,0.427893,0.494832,0.0,0.0,0.0,1.0,1.0,"{0: 2408, 1: 1801}","[0, 1]"
X15,0,int64,4209.0,0.000713,0.026691,0.0,0.0,0.0,0.0,1.0,"{0: 4206, 1: 3}","[0, 1]"
X16,0,int64,4209.0,0.002613,0.051061,0.0,0.0,0.0,0.0,1.0,"{0: 4198, 1: 11}","[0, 1]"
X17,0,int64,4209.0,0.008791,0.093357,0.0,0.0,0.0,0.0,1.0,"{0: 4172, 1: 37}","[0, 1]"
X18,0,int64,4209.0,0.010216,0.10057,0.0,0.0,0.0,0.0,1.0,"{0: 4166, 1: 43}","[0, 1]"


In [17]:
test_numeric_insight.Nan.sum()  

0

In [18]:
# categorical test insights
test_categorical_insight = pd.DataFrame({'Nan': test_categorical.isna().sum(),
                                         'Obj Type': test_categorical.dtypes})
test_categorical_insight = pd.concat([test_categorical_insight,
                                       test_categorical.describe().T,
                                       pd.Series(test_categorical.apply(np.unique), name='unique_val')], axis=1)
test_categorical_insight.head(20)

Unnamed: 0,Nan,Obj Type,count,unique,top,freq,unique_val
X0,0,object,4209,49,ak,432,"[a, ad, ae, af, ag, ai, aj, ak, al, am, an, ao..."
X1,0,object,4209,27,aa,826,"[a, aa, ab, b, c, d, e, f, g, h, i, j, k, l, m..."
X2,0,object,4209,45,as,1658,"[a, ab, ac, ad, ae, af, ag, ah, ai, aj, ak, al..."
X3,0,object,4209,7,c,1900,"[a, b, c, d, e, f, g]"
X4,0,object,4209,4,d,4203,"[a, b, c, d]"
X5,0,object,4209,32,v,246,"[a, aa, ab, ac, ad, ae, af, ag, ah, b, c, d, f..."
X6,0,object,4209,12,g,1073,"[a, b, c, d, e, f, g, h, i, j, k, l]"
X8,0,object,4209,25,e,274,"[a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, ..."


In [19]:
test_categorical_insight.Nan.sum()

0

***
## Check for differences in values for categorical in test and train
>### whether categories are same for both train and test
>### check X0,X1,X3...,X8 feature values in both test and train
***

In [20]:
df_test_train_diff_cat = pd.DataFrame(columns=['feature', 'label_not_in_train', 'count_of_labels'])
if train_categorical_insight.index.difference(test_categorical_insight.index).__len__() == 0:
    for feature in train_categorical_insight.index:
        set_val_train = set(train_categorical_insight.loc[feature, 'unique_val'])
        set_val_test = set(test_categorical_insight.loc[feature, 'unique_val'])
        set_val_test.difference_update(set_val_train) # remove all train from test...this means model will face unseen data in test
        #print('labels not present in test :', feature, set_val_test if len(set_val_test) > 0 else None)
        #print('count of such labels: ', test_categorical[feature].value_counts().loc[list(set_val_test)])
        last_row_index = len(df_test_train_diff_cat)
        if len(set_val_test) > 0:
            df_test_train_diff_cat.loc[last_row_index] = [feature, 
                                           list(set_val_test)[:] if len(set_val_test) > 0 else [], 
                                           list(test_categorical[feature].value_counts().loc[list(set_val_test)])
                                           ]
print("Following features in test has labels that are not present in train. Model may not make good predictions for them")
df_test_train_diff_cat.head(20)

Following features in test has labels that are not present in train. Model may not make good predictions for them


Unnamed: 0,feature,label_not_in_train,count_of_labels
0,X0,"[ag, bb, p, ae, an, av]","[1, 1, 1, 1, 1, 1]"
1,X2,"[ad, aj, ax, u, ab, w]","[4, 1, 1, 1, 4, 3]"
2,X5,"[a, b, z, t]","[1, 1, 1, 1]"


***
## Check for value ranges between test and train numeric
***

In [21]:
df_unique = pd.concat([test_numeric_insight.unique_val, test_numeric_insight.unique_val], axis=1)
df_unique['diff'] = df_unique.apply(lambda x: x[1] not in x[0], axis=1)
print("Following features have different ranges of values from test and train: ", df_unique['diff'].sum())

Following features have different ranges of values from test and train:  0


***
## If for any column(s), the variance is equal to zero, then you need to remove those variable(s).
***

In [22]:
train_numerical_insight =  pd.concat([train_numerical_insight, train_numeric.T.apply(np.var, axis=1)], axis=1)
test_numeric_insight = pd.concat([test_numeric_insight, test_numeric.apply(np.var, axis=1)], axis=1)
train_numerical_insight.head(20)

Unnamed: 0,Nan,Obj Type,count,mean,std,min,25%,50%,75%,max,unique_valcount,unique_val,0
ID,0,int64,4209.0,4205.960798,2437.608688,0.0,2095.0,4220.0,6314.0,8417.0,"{0: 1, 5600: 1, 5604: 1, 5606: 1, 5611: 1, 561...","[0, 6, 7, 9, 13, 18, 24, 25, 27, 30, 31, 32, 3...",5940524.0
y,0,float64,4209.0,100.669318,12.679381,72.11,90.82,99.15,109.01,265.32,"{91.88: 7, 89.38: 7, 89.06: 7, 90.76: 7, 89.19...","[72.11, 72.5, 72.94, 73.02, 73.15, 73.24, 73.2...",160.7285
X10,0,int64,4209.0,0.013305,0.11459,0.0,0.0,0.0,0.0,1.0,"{0: 4153, 1: 56}","[0, 1]",0.0131278
X11,0,int64,4209.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,{0: 4209},[0],0.0
X12,0,int64,4209.0,0.075077,0.263547,0.0,0.0,0.0,0.0,1.0,"{0: 3893, 1: 316}","[0, 1]",0.06944063
X13,0,int64,4209.0,0.057971,0.233716,0.0,0.0,0.0,0.0,1.0,"{0: 3965, 1: 244}","[0, 1]",0.05461038
X14,0,int64,4209.0,0.42813,0.494867,0.0,0.0,0.0,1.0,1.0,"{0: 2407, 1: 1802}","[0, 1]",0.2448347
X15,0,int64,4209.0,0.000475,0.021796,0.0,0.0,0.0,0.0,1.0,"{0: 4207, 1: 2}","[0, 1]",0.0004749465
X16,0,int64,4209.0,0.002613,0.051061,0.0,0.0,0.0,0.0,1.0,"{0: 4198, 1: 11}","[0, 1]",0.002606617
X17,0,int64,4209.0,0.007603,0.086872,0.0,0.0,0.0,0.0,1.0,"{0: 4177, 1: 32}","[0, 1]",0.007544954


In [23]:
test_numeric_insight.head(20)

Unnamed: 0,Nan,Obj Type,count,mean,std,min,25%,50%,75%,max,unique_valcount,unique_val,0
0,,,,,,,,,,,,,0.126835
1,,,,,,,,,,,,,0.157446
2,,,,,,,,,,,,,0.152422
3,,,,,,,,,,,,,0.172311
4,,,,,,,,,,,,,0.207916
5,,,,,,,,,,,,,0.291552
6,,,,,,,,,,,,,0.40903
7,,,,,,,,,,,,,0.455152
8,,,,,,,,,,,,,0.513113
9,,,,,,,,,,,,,0.655166


In [24]:
# drop columns if all values are NA in a column
train_numeric = train_numeric.dropna(axis=1, how='all')
test_numeric = test_numeric.dropna(axis=1, how='all')

# drop columns if all values are 0/same in a column.in other words no variance
series_train = train_numeric.apply(np.var)
series_test = test_numeric.apply(np.var)
series_train = list(series_train[series_train == 0].index)
series_test = list(series_test[series_test == 0].index)

print('no var features in train: ', series_train, '\nno var features in test: ', series_test)
print('common no var features in train test: ', set(series_test).intersection(set(series_train)))

train_numeric = train_numeric.drop(series_train, axis=1)
test_numeric = test_numeric.drop(series_test, axis=1)

train_numeric.head(20)

no var features in train:  ['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290', 'X293', 'X297', 'X330', 'X347'] 
no var features in test:  ['X257', 'X258', 'X295', 'X296', 'X369']
common no var features in train test:  set()


Unnamed: 0,ID,y,X10,X12,X13,X14,X15,X16,X17,X18,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,0,0,1,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,18,92.93,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
6,24,128.76,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,25,91.91,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,27,108.67,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
9,30,126.99,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [25]:
test_numeric.head(20)

Unnamed: 0,ID,X10,X11,X12,X13,X14,X15,X16,X17,X18,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,3,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,5,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,8,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
6,10,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
7,11,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
8,12,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,14,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


***
## Feature Engineering before PCA
## 2 sets of Preprocesing techniques for categorical data we use
>## X1, y1 on train using LabelEncoder_local and X2,y2 on train using OneHotEncoder
***

In [26]:
train_categorical_enc = LabelEncoder_Local(train_categorical.copy(), train_categorical.columns.values)

X1 = pd.concat([train_numeric, train_categorical_enc], axis=1)
y1 = X1['y']
X1.drop(['ID', 'y'], axis=1, inplace=True)
X1.head(20)

Unnamed: 0,X10,X12,X13,X14,X15,X16,X17,X18,X19,X20,...,X384,X385,X0,X1,X2,X3,X4,X5,X6,X8
0,0,0,1,0,0,0,0,1,0,0,...,0,0,0.027855,0.487952,0.003016,0.203183,1.0,0.0,0.997087,0.355932
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0.027855,0.033735,0.001809,0.056233,1.0,0.0,0.452427,0.355932
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0.48468,0.059036,0.082027,1.0,1.0,0.004348,0.997087,0.028249
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0.48468,0.033735,0.082027,0.540584,1.0,0.004348,0.452427,0.706215
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0.48468,0.487952,0.082027,0.540584,1.0,0.0,0.595146,0.80226
5,0,0,0,1,0,0,0,0,0,0,...,0,0,0.849582,0.709639,0.048251,1.0,1.0,0.0,0.172816,0.875706
6,0,0,0,1,0,0,0,0,0,0,...,0,0,0.183844,0.298795,0.048251,0.540584,1.0,0.026087,0.172816,0.875706
7,0,0,0,1,0,0,0,0,0,0,...,0,0,0.746518,0.707229,1.0,0.540584,1.0,0.026087,0.997087,0.621469
8,0,0,0,1,0,0,0,0,0,0,...,0,0,0.504178,0.716867,1.0,0.056233,1.0,0.026087,0.462136,0.096045
9,0,0,0,1,0,0,0,0,0,0,...,0,0,0.501393,0.709639,0.037394,1.0,1.0,0.026087,0.18835,0.706215


In [27]:
one_encoder = OneHotEncoder(sparse=False)
one_encoder.fit(train_categorical)
X_categorical_Onehotcoded = one_encoder.transform(train_categorical)
X_categorical_Onehotcoded = pd.DataFrame(X_categorical_Onehotcoded, columns=one_encoder.get_feature_names())

X2 = pd.concat([train_numeric, X_categorical_Onehotcoded], axis=1)
X2.reset_index(drop=True)
y2 = train['y']
X2.drop(['ID', 'y'], axis=1, inplace=True)
X2.head(20)

Unnamed: 0,X10,X12,X13,X14,X15,X16,X17,X18,X19,X20,...,x7_p,x7_q,x7_r,x7_s,x7_t,x7_u,x7_v,x7_w,x7_x,x7_y
0,0,0,1,0,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,0,0,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,0,0,0,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


***
## Now we do Feature Extractiion using different methods (get important features)
***

In [28]:
imp_featset_1 = get_imp_features_pairwise_dist(X1,y1)
imp_featset_2 = get_imp_features_RandomForest(X1, y1)
imp_featset_3 = get_imp_features_tree_regresors(X1, y1)
imp_featset_4 = get_imp_features_RFE(X1, y1)
imp_featset_5 = get_imp_features_PCA(X1, y1)
final_features = list(set(np.append(imp_featset_1, imp_featset_2)))
final_features = list(set(np.append(final_features, imp_featset_3)))
final_features = list(set(np.append(final_features, imp_featset_4)))
final_imp_features = list(set(np.append(final_features, imp_featset_5)))
#final_imp_features

***
## Train XGBoost 
>## 1. base r2_score without tuning
>## 2. hyperparameter tuning
***

In [29]:
Xtrain_1, Xtest_1, ytrain_1, ytest_1 = train_test_split(X1[final_imp_features], y1, random_state=100, test_size=0.3)

In [30]:
params = {
    # Parameters that we are going to tune.
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    # Other parameters
    'objective':'reg:squarederror',
    'eval_metric': 'mae'
}

num_boost_round = 999

dtrain = xgb.DMatrix(Xtrain_1, label=ytrain_1)
dtest = xgb.DMatrix(Xtest_1, label=ytest_1)

In [31]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10,
    verbose_eval = False
)
print("Best MAE: {:.2f} in {} rounds".format(model.best_score, model.best_iteration+1))
y_pred = model.predict(dtest)
mean_absolute_error(y_pred, ytest_1)
print('Base r2_score :',r2_score(ytest_1, y_pred))

Best MAE: 5.30 in 11 rounds
Base r2_score : 0.48604825815657315


***
## We have base r2_score of 0.48. Lets train XGBRegressor in hyperparameter tuning first

***

In [32]:
gridsearch_params = [
    (max_depth, min_child_weight) for max_depth in range(1,20) for min_child_weight in range(1,3) 
]
min_mae = float('Inf')
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    #print("CV with max_depth:{}, min_child_weight: {} ". format(max_depth, min_child_weight))
    
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    
    cv_results = xgb.cv(params, dtrain, num_boost_round=num_boost_round, seed=42, nfold=5, metrics={'mae'}, early_stopping_rounds=10)
    
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    #print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth,min_child_weight)
        params['max_depth'] = max_depth
        params['min_child_weight'] = min_child_weight

print("Best params: max_depth: {}, min_child_weight: {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

Best params: max_depth: 4, min_child_weight: 2, MAE: 4.7757464


In [33]:
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(1,9)]
    for colsample in [i/10. for i in range(1,9)]
]
min_mae = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    # print("CV with subsample={}, colsample={}".format(
    #                          subsample,
    #                          colsample))
    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    #print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (subsample,colsample)
        params['subsample'] = subsample
        params['colsample_bytree'] = colsample
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

Best params: 0.6, 0.6, MAE: 5.3126238


In [34]:
# This can take some time…
min_mae = float("Inf")
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
# We update our parameters
params['eta'] = eta
# Run and time CV

cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics=['mae'],
        early_stopping_rounds=10
        )
# Update best score
mean_mae = cv_results['test-mae-mean'].min()
boost_rounds = cv_results['test-mae-mean'].argmin()
print("\tMAE {} for {} rounds\n".format(mean_mae, boost_rounds))
if mean_mae < min_mae:
    min_mae = mean_mae
    best_params = eta
    params['eta'] = eta
print("Best params: {}, MAE: {}".format(best_params, min_mae))

CV with eta=0.3
CV with eta=0.2
CV with eta=0.1
CV with eta=0.05
CV with eta=0.01
CV with eta=0.005
	MAE 5.227727 for 948 rounds

Best params: 0.005, MAE: 5.227727


In [35]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10,
    verbose_eval = False
)
print("Best MAE: {:.2f} in {} rounds".format(model.best_score, model.best_iteration+1))
y_pred = model.predict(dtest)
print('MAE: ', mean_absolute_error(y_pred, ytest_1))
print('r2_score  :',r2_score(ytest_1, y_pred))

Best MAE: 5.66 in 893 rounds
MAE:  5.662934020609489
r2_score  : 0.4457697958741953


***
## Cross validation of hyperparameters
***

In [36]:
param_grid = {key: [value] for key, value in params.items()}
xg = xgb.XGBRegressor(verbosity=0, max_depth = 3, min_child_weight =2,
                        eta = 0.005, subsampe= 0.8, colsample_bytree = 0.8,
                        objective = 'reg:squarederror', eval_metric='mae', learning_rate=0.05,
                        n_estimators=100)
best_xgb = GridSearchCV(xg, param_grid=param_grid, cv=10, verbose=0, n_jobs=-1)

scores = cross_val_score(best_xgb, Xtrain_1, ytrain_1, scoring='r2', cv=10)
print('train r2_scores: ', scores.mean())

scores = cross_val_score(best_xgb, Xtest_1, ytest_1, scoring='r2', cv=10)
print('test r2_scores: ', scores.mean())

train r2_scores:  0.5503746118073789
test r2_scores:  0.4762468425059292


In [37]:
param_grid = {key: [value] for key, value in params.items()}
xg = xgb.XGBRegressor(verbosity=0, max_depth = 2, min_child_weight =2,
                        eta = 0.005, subsampe= 0.4, colsample_bytree = 0.5,
                        objective = 'reg:squarederror', eval_metric='mae', learning_rate=0.05,
                        n_estimators=150)
xg.fit(Xtrain_1, ytrain_1)
y_pred = xg.predict(Xtest_1)
print('MAE: ', mean_absolute_error(ytest_1, y_pred))
print('final r2_score for X1 set of features :',r2_score(ytest_1, y_pred))


MAE:  5.524793805796668
final r2_score for X1 set of features : 0.5060182414281343


***
## XGBoost with second set of feature selected (X2,y2 ..created uisng ONeHotEncoding)  
***

In [38]:
imp_featset_1 = get_imp_features_pairwise_dist(X2,y2)
imp_featset_2 = get_imp_features_RandomForest(X2, y2)
imp_featset_3 = get_imp_features_tree_regresors(X2, y2)
imp_featset_4 = get_imp_features_RFE(X2, y2)
imp_featset_5 = get_imp_features_PCA(X2, y2)
final_features2 = list(set(np.append(imp_featset_1, imp_featset_2)))
final_features2 = list(set(np.append(final_features2, imp_featset_3)))
final_features2 = list(set(np.append(final_features2, imp_featset_4)))
final_imp_features2 = list(set(np.append(final_features2, imp_featset_5)))
#final_imp_features2

In [39]:
Xtrain_2, Xtest_2, ytrain_2, ytest_2 = train_test_split(X2[final_imp_features2], y2, random_state=100, test_size=0.3)

In [40]:
params = {
    # Parameters that we are going to tune.
    'max_depth':2,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    
    'colsample_bytree': 1,
    # Other parameter2
    'objective':'reg:squarederror',
    'eval_metric': 'mae'
}

num_boost_round = 999

dtrain = xgb.DMatrix(Xtrain_2, label=ytrain_2)
dtest = xgb.DMatrix(Xtest_2, label=ytest_2)

In [41]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10,
    verbose_eval = False
)
print("Best MAE: {:.2f} in {} rounds".format(model.best_score, model.best_iteration+1))
y_pred = model.predict(dtest)
print('MAE :', mean_absolute_error(ytest_2, y_pred))
print('Base r2_score :',r2_score(ytest_2, y_pred))

Best MAE: 5.23 in 11 rounds
MAE : 5.543316008869089
Base r2_score : 0.5040527340833084


***
## We have base r2_score of 0.50. Lets train XGBRegressor in hyperparameter tuning first

***

In [42]:
gridsearch_params = [
    (max_depth, min_child_weight) for max_depth in range(1,20) for min_child_weight in range(1,3) 
]
min_mae = float('Inf')
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    #print("CV with max_depth:{}, min_child_weight: {} ". format(max_depth, min_child_weight))
    
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    
    cv_results = xgb.cv(params, dtrain, num_boost_round=num_boost_round, seed=42, nfold=5, metrics={'mae'}, early_stopping_rounds=10)
    
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    #print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth,min_child_weight)
        params['max_depth'] = max_depth
        params['min_child_weight'] = min_child_weight

print("Best params: max_depth: {}, min_child_weight: {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

Best params: max_depth: 3, min_child_weight: 1, MAE: 4.799137399999999


In [43]:
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(1,9)]
    for colsample in [i/10. for i in range(1,9)]
]
min_mae = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    # print("CV with subsample={}, colsample={}".format(
    #                          subsample,
    #                          colsample))
    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    #print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (subsample,colsample)
        params['subsample'] = subsample
        params['colsample_bytree'] = colsample
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

Best params: 0.8, 0.3, MAE: 5.2067756


In [44]:
# This can take some time…
min_mae = float("Inf")
best_params = None
# for eta in [.3, .2, .1, .05, .01, .005]:
#     print("CV with eta={}".format(eta))
# We update our parameters
params['eta'] = eta
# Run and time CV

cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics=['mae'],
        early_stopping_rounds=10
        )
# Update best score
mean_mae = cv_results['test-mae-mean'].min()
boost_rounds = cv_results['test-mae-mean'].argmin()
print("\tMAE {} for {} rounds\n".format(mean_mae, boost_rounds))
if mean_mae < min_mae:
    min_mae = mean_mae
    best_params = eta
    params['eta'] = eta
print("Best params: {}, MAE: {}".format(best_params, min_mae))

	MAE 5.245188199999999 for 916 rounds

Best params: 0.005, MAE: 5.245188199999999


In [45]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10,
    verbose_eval = False
)
print("Best MAE: {:.2f} in {} rounds".format(model.best_score, model.best_iteration+1))
y_pred = model.predict(dtest)
print('MAE: ', mean_absolute_error(ytest_2, y_pred))
print('final r2_score  :',r2_score(ytest_2, y_pred))

Best MAE: 5.69 in 897 rounds
MAE:  5.6934673081592075
final r2_score  : 0.4447761851511567


***
## Cross validation of hyperparameters
***

In [50]:
param_grid = {key: [value] for key, value in params.items()}
xg = xgb.XGBRegressor(verbosity=0, max_depth = 3, min_child_weight =2,
                        eta = 0.005, subsampe= 0.8, colsample_bytree = 0.8,
                        objective = 'reg:squarederror', eval_metric='mae', learning_rate=0.05,
                        n_estimators=100)
best_xgb = GridSearchCV(xg, param_grid=param_grid, cv=10, verbose=0, n_jobs=-1)

scores = cross_val_score(best_xgb, Xtrain_2, ytrain_2, scoring='r2', cv=10)
print('train r2_scores: ', scores.mean())

scores = cross_val_score(best_xgb, Xtest_2, ytest_2, scoring='r2', cv=10)
print('test r2_scores: ', scores.mean())

train r2_scores:  0.5640631863251038
test r2_scores:  0.45755522431333395


In [47]:
param_grid = {key: [value] for key, value in params.items()}
xg = xgb.XGBRegressor(verbosity=0, max_depth = 2, min_child_weight =2,
                        eta = 0.005, subsampe= 0.4, colsample_bytree = 0.5,
                        objective = 'reg:squarederror', eval_metric='mae', learning_rate=0.05,
                        n_estimators=150)
xg.fit(Xtrain_2, ytrain_2)
y_pred = xg.predict(Xtest_2)
print('MAE: ', mean_absolute_error(ytest_2, y_pred))
print('final r2_score for X2 set of features :',r2_score(ytest_2, y_pred))

MAE:  5.541430651014604
final r2_score for X2 set of features : 0.5055386652015406


***
## Since scores are almost same for both types of feature egieering, lets predict usiing OneHotEncoding technique predict test.csv
***

In [48]:
one_encoder = OneHotEncoder(sparse=False)
one_encoder.fit(test_categorical)
X_categorical_Onehotcoded = one_encoder.transform(test_categorical)
X_categorical_Onehotcoded = pd.DataFrame(X_categorical_Onehotcoded, columns=one_encoder.get_feature_names())

test_csv_data = pd.concat([test_numeric, X_categorical_Onehotcoded], axis=1)
test_csv_data.reset_index(drop=True)
test_csv_data.drop(['ID'], axis=1, inplace=True)
test_csv_data.head()

Unnamed: 0,X10,X11,X12,X13,X14,X15,X16,X17,X18,X19,...,x7_p,x7_q,x7_r,x7_s,x7_t,x7_u,x7_v,x7_w,x7_x,x7_y
0,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0,0,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0,0,0,0,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,0,0,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
# take same ONeHotEocnded columns from train.csv featured engineered

test_csv_data['y'] = xg.predict(test_csv_data[Xtrain_2.columns])
test_csv_data.head(20)

Unnamed: 0,X10,X11,X12,X13,X14,X15,X16,X17,X18,X19,...,x7_q,x7_r,x7_s,x7_t,x7_u,x7_v,x7_w,x7_x,x7_y,y
0,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,80.198524
1,0,0,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,94.00473
2,0,0,0,0,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,79.335838
3,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,79.742821
4,0,0,0,0,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,111.884338
5,0,0,0,0,0,0,0,0,0,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,93.406944
6,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,111.816101
7,0,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,93.857582
8,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,116.421638
9,0,0,0,0,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,94.589806
