In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import  RandomForestRegressor
import lightgbm as lgb
import xgboost as xgb

In [2]:
train_data = pd.read_csv('./data/Train.csv')
test_data = pd.read_csv('./data/Test.csv')

In [3]:
train_data.head()

Unnamed: 0,Item_Id,Date,State_of_Country,Market_Category,Product_Category,Grade,Demand,Low_Cap_Price,High_Cap_Price
0,IT_1,2007-07-05,0,0,0,0,0.5,2785,4240
1,IT_2,2007-07-05,0,1,0,0,0.7,3574,4756
2,IT_3,2007-07-05,0,103,0,1,1.6,5978,9669
3,IT_4,2007-07-05,0,103,0,0,0.0,5681,8313
4,IT_5,2007-07-05,0,103,0,2,0.0,4924,7257


In [4]:
test_data.head()

Unnamed: 0,Item_Id,Date,State_of_Country,Market_Category,Product_Category,Grade,Demand,High_Cap_Price
0,IT_265079,2014-01-19,0,3,0,0,0.1,5303
1,IT_265087,2014-01-19,0,268,0,2,1.4,10492
2,IT_265107,2014-01-19,0,320,0,0,13.4,24706
3,IT_265139,2014-01-19,17,358,0,3,5.5,23464
4,IT_265142,2014-01-19,18,23,5,2,0.0,6222


In [5]:
train_data.drop(['Date'],axis = 1, inplace = True)
train_data.drop(['Item_Id'], axis = 1, inplace = True)

In [6]:
X = train_data.drop('Low_Cap_Price', axis =1)
y = train_data['Low_Cap_Price']

In [7]:
rf_model= RandomForestRegressor()
lgb_model = lgb.LGBMRegressor()
xgb_model = xgb.XGBRegressor()

In [8]:
lgbm_random_grid = { 'n_estimators' : [int(x) for x in np.linspace(start=100,stop=1400,num=14)],
                'max_features' : ['auto','sqrt', 'log2'],
                'max_depth' :[int(x) for x in np.linspace(5,30,num=6)],
                'min_samples_split' : [2,5,10,15, 50,75, 100],
                'min_samples_leaf' : [1,2,5,10]
}

In [9]:
xgb_random_grid = { 'n_estimators' : [int(x) for x in np.linspace(start=100,stop=1400,num=14)],
                'max_features' : ['auto','sqrt', 'log2'],
                'max_depth' :[int(x) for x in np.linspace(5,30,num=6)],
                'min_samples_split' : [2,5,10,15,50,75,100],
                'min_samples_leaf' : [1,2,5,10],
                #'objective': ["count:poisson","reg:squarederror", "reg:squaredlogerror", "reg:gamma", "reg:tweedie"]
}

In [10]:
rf_random_grid = { 'n_estimators' : [int(x) for x in np.linspace(start=100,stop=1400,num=14)],
                'max_features' : ['auto','sqrt', 'log2'],
                'max_depth' :[int(x) for x in np.linspace(5,30,num=6)],
                'min_samples_split' : [2,5,10,15,50,75,100],
                'min_samples_leaf' : [1,2,5,10]
}

In [11]:
lgbm_random = RandomizedSearchCV(estimator=lgb_model, param_distributions=lgbm_random_grid,scoring='neg_mean_squared_error',n_iter=15,cv=5,verbose=2,random_state=777, 
                                 n_jobs=6)
xgb_random = RandomizedSearchCV(estimator=xgb_model, param_distributions=xgb_random_grid,scoring='neg_mean_squared_error',n_iter=15,cv=5,verbose=2,random_state=777,n_jobs=6)
rf_random = RandomizedSearchCV(estimator=rf_model, param_distributions=rf_random_grid,scoring='neg_mean_squared_error',n_iter=15,cv=5,verbose=2,random_state=777,n_jobs=6)

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.2)

In [13]:
%%time
kf = KFold(n_splits=5,random_state=1,shuffle=True)
index = 1
for train_index,test_index in kf.split(X,y):
    print(index)
    index = index+1
    X_train,X_test = X.loc[train_index],X.loc[test_index]
    y_train,y_test = y[train_index],y[test_index]

    lgbm_random.fit(X_train,y_train, categorical_feature  = ['State_of_Country','Market_Category', 'Product_Category'])
    xgb_random.fit(X_train,y_train)
    rf_random.fit(X_train,y_train)

1
Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:    4.1s
[Parallel(n_jobs=6)]: Done  75 out of  75 | elapsed:   11.0s finished
New categorical_feature is ['Market_Category', 'Product_Category', 'State_of_Country']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.


Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:   47.6s
[Parallel(n_jobs=6)]: Done  75 out of  75 | elapsed:  2.1min finished


Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:   11.0s
[Parallel(n_jobs=6)]: Done  75 out of  75 | elapsed:   44.6s finished


2
Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:    3.3s
[Parallel(n_jobs=6)]: Done  75 out of  75 | elapsed:    9.8s finished
New categorical_feature is ['Market_Category', 'Product_Category', 'State_of_Country']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.


Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:   44.9s
[Parallel(n_jobs=6)]: Done  75 out of  75 | elapsed:  2.0min finished


Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:   10.9s
[Parallel(n_jobs=6)]: Done  75 out of  75 | elapsed:   44.9s finished


3
Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:    3.2s
[Parallel(n_jobs=6)]: Done  75 out of  75 | elapsed:   10.0s finished
New categorical_feature is ['Market_Category', 'Product_Category', 'State_of_Country']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.


Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:   42.3s
[Parallel(n_jobs=6)]: Done  75 out of  75 | elapsed:  2.0min finished


Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:   10.9s
[Parallel(n_jobs=6)]: Done  75 out of  75 | elapsed:   44.6s finished


4
Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:    3.2s
[Parallel(n_jobs=6)]: Done  75 out of  75 | elapsed:   11.9s finished
New categorical_feature is ['Market_Category', 'Product_Category', 'State_of_Country']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.


Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:   43.7s
[Parallel(n_jobs=6)]: Done  75 out of  75 | elapsed:  2.1min finished


Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:   11.6s
[Parallel(n_jobs=6)]: Done  75 out of  75 | elapsed:   46.5s finished


5
Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:    3.3s
[Parallel(n_jobs=6)]: Done  75 out of  75 | elapsed:   10.3s finished
New categorical_feature is ['Market_Category', 'Product_Category', 'State_of_Country']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.


Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:   43.1s
[Parallel(n_jobs=6)]: Done  75 out of  75 | elapsed:  2.0min finished


Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:   11.0s
[Parallel(n_jobs=6)]: Done  75 out of  75 | elapsed:   44.6s finished


Wall time: 15min 29s


In [14]:
lgbm_random.best_params_

{'n_estimators': 100,
 'min_samples_split': 15,
 'min_samples_leaf': 5,
 'max_features': 'sqrt',
 'max_depth': 15}

In [15]:
xgb_random.best_params_

{'n_estimators': 600,
 'min_samples_split': 2,
 'min_samples_leaf': 5,
 'max_features': 'log2',
 'max_depth': 5}

In [16]:
rf_random.best_params_

{'n_estimators': 400,
 'min_samples_split': 15,
 'min_samples_leaf': 2,
 'max_features': 'auto',
 'max_depth': 10}

In [17]:
pred1 = rf_random.predict(X_test)
pred2 = lgbm_random.predict(X_test)
pred3 = xgb_random.predict(X_test)

In [18]:
item_id = test_data['Item_Id']

In [19]:
test_data.drop(['Date'], axis = 1, inplace = True)
test_data.drop(['Item_Id'], axis = 1,inplace=True)

In [20]:
y_pred1 = rf_random.predict(test_data)
y_pred2 = lgbm_random.predict(test_data)
y_pred3 = xgb_random.predict(test_data)

In [21]:
y_pred=(y_pred1*0.3+y_pred2*0.4+y_pred3*0.3)

In [22]:
test_data['Low_Cap_Price'] = y_pred
test_data['Item_Id'] = item_id

In [23]:
df_new=pd.DataFrame(test_data,columns={'Item_Id','Low_Cap_Price'})
df_new = df_new[['Item_Id', 'Low_Cap_Price']]
df_new.to_csv(r"./submission.csv",index=False)