In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV,StratifiedKFold,KFold
from sklearn.preprocessing import StandardScaler,MinMaxScaler,LabelEncoder
from sklearn.tree import DecisionTreeRegressor,plot_tree
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss,mean_squared_error,r2_score

In [28]:
# Load the training data; the first CSV column is used as the row index.
crab = pd.read_csv(
    "train.csv",
    index_col=0,
)
# Preview the first rows as a sanity check on the parsed columns.
crab.head()

Unnamed: 0_level_0,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,I,1.525,1.175,0.375,28.973189,12.728926,6.647958,8.348928,9
1,I,1.1,0.825,0.275,10.418441,4.521745,2.324659,3.40194,8
2,M,1.3875,1.1125,0.375,24.777463,11.3398,5.556502,6.662133,9
3,F,1.7,1.4125,0.5,50.660556,20.354941,10.991839,14.996885,11
4,I,1.25,1.0125,0.3375,23.289114,11.977664,4.50757,5.953395,8


In [29]:
# One-hot encode every non-numeric column as 0/1 integers. The first level of
# each encoded column is dropped, so it acts as the implicit baseline.
crab = pd.get_dummies(
    data=crab,
    dtype='int',
    drop_first=True,
)
crab.head()

Unnamed: 0_level_0,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age,Sex_I,Sex_M
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,1.525,1.175,0.375,28.973189,12.728926,6.647958,8.348928,9,1,0
1,1.1,0.825,0.275,10.418441,4.521745,2.324659,3.40194,8,1,0
2,1.3875,1.1125,0.375,24.777463,11.3398,5.556502,6.662133,9,0,1
3,1.7,1.4125,0.5,50.660556,20.354941,10.991839,14.996885,11,0,0
4,1.25,1.0125,0.3375,23.289114,11.977664,4.50757,5.953395,8,1,0


In [30]:
# Separate the regression target (Age) from the feature matrix.
y = crab['Age']
X = crab.drop(columns=['Age'])

In [31]:
# Base estimator for the grid search; the fixed seed keeps tree construction
# repeatable between runs.
dtr = DecisionTreeRegressor(
    random_state=23,
)

In [32]:
# 5-fold cross-validation splitter; rows are shuffled with a fixed seed so the
# fold assignment is reproducible.
kfold = KFold(
    n_splits=5,
    shuffle=True,
    random_state=23,
)

In [33]:
# Hyper-parameter grid: 10 depths x 7 split sizes x 6 leaf sizes = 420
# candidates, each evaluated on every fold of `kfold`.
params = {
    'max_depth': [*range(2, 11), None],        # 2..10, plus unlimited depth
    'min_samples_split': [2, *range(5, 11)],   # 2, then 5..10
    'min_samples_leaf': [1, 3, 5, 7, 10, 15],
}

# Exhaustive search scored by negated MSE (higher, i.e. closer to 0, is better).
gcv = GridSearchCV(
    estimator=dtr,
    param_grid=params,
    scoring='neg_mean_squared_error',
    cv=kfold,
)
gcv.fit(X, y)

In [34]:
# Report the winning parameter combination and its mean cross-validated score
# (negated MSE, so values closer to zero are better).
for label, value in (
    ("best parameter :", gcv.best_params_),
    ("best score :", gcv.best_score_),
):
    print(label, value)

best parameter : {'max_depth': 8, 'min_samples_leaf': 15, 'min_samples_split': 2}
best score : -4.428276556547479


### INFERENCE

In [37]:
# Load the test set; the first CSV column is used as the row index.
crab_test = pd.read_csv("test.csv", index_col=0)

# Encode the test set so that it has exactly the training feature columns.
# Encoding it independently with drop_first=True is fragile: if the test file
# does not contain the same category levels as the training file, a different
# level gets dropped and the dummy columns no longer mean the same thing (or
# predict() fails on mismatched feature names). Instead, encode every level
# and then align to the columns the model was trained on:
#   * the training baseline level (dropped there) is dropped here as well,
#   * a level absent from the test file becomes an all-zero column,
#   * column order matches the training matrix `X`.
# NOTE(review): reindex also zero-fills any other training column missing from
# the test file and discards levels unseen in training - confirm the test file
# has the same raw schema as train.csv.
crab_test = (
    pd.get_dummies(crab_test, dtype='int')
    .reindex(columns=X.columns, fill_value=0)
)
crab_test.head()

Unnamed: 0_level_0,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Sex_I,Sex_M
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
74051,1.05,0.7625,0.275,8.618248,3.657085,1.729319,2.721552,1,0
74052,1.1625,0.8875,0.275,15.507176,7.030676,3.246018,3.96893,1,0
74053,1.2875,0.9875,0.325,14.571643,5.556502,3.883882,4.819415,0,0
74054,1.55,0.9875,0.3875,28.377849,13.380964,6.548735,7.030676,0,0
74055,1.1125,0.85,0.2625,11.765042,5.528153,2.466407,3.331066,1,0


In [36]:
# Take the estimator that the grid search exposes for its best-scoring
# parameter combination; it is used below to predict on the test features.
best_model = gcv.best_estimator_

In [38]:
# Predict Age for every row of the encoded test features.
y_pred = best_model.predict(X=crab_test)
y_pred

array([ 7.9787234 ,  7.70704574, 10.22368421, ..., 13.40625   ,
       10.22368421, 12.24096386])

In [39]:
# Assemble the submission table: one row per test id with its predicted Age.
# NOTE(review): the source index is named `id` (lower-case) while this header
# is 'Id' - confirm which casing the submission format expects.
submission = pd.DataFrame(
    data={
        'Id': crab_test.index,
        'Age': y_pred,
    }
)
submission

Unnamed: 0,Id,Age
0,74051,7.978723
1,74052,7.707046
2,74053,10.223684
3,74054,9.491897
4,74055,7.510249
...,...,...
49363,123414,7.900000
49364,123415,8.134503
49365,123416,13.406250
49366,123417,10.223684


In [40]:
# Write the submission file without the DataFrame's positional index column.
submission.to_csv(
    "Submit_Crab.csv",
    index=False,
)