# Modelling 
Making/training the estimator

### In this notebook  we will be modelling, fine tuning the model and then performing feature engineering.

In [2]:
# standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# import models
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

# evaluation metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error

# saving and loading the model 
import joblib

### Importing the training data

In [3]:
train_data = pd.read_csv('data/train.csv', parse_dates=['date']) # import the data
train_data.head()

Unnamed: 0,APMC,Commodity,Year,Month,arrivals_in_qtl,min_price,max_price,modal_price,date,district_name,state_name,month_int
0,Ahmednagar,Bajri,2015,April,79.0,1406.0,1538.0,1463.0,2015-04-01,Ahmadnagar,Maharashtra,4
1,Ahmednagar,Bajri,2016,April,106.0,1788.0,1925.0,1875.0,2016-04-01,Ahmadnagar,Maharashtra,4
2,Ahmednagar,Wheat(Husked),2015,April,1253.0,1572.0,1890.0,1731.0,2015-04-01,Ahmadnagar,Maharashtra,4
3,Ahmednagar,Wheat(Husked),2016,April,387.0,1750.0,2220.0,1999.0,2016-04-01,Ahmadnagar,Maharashtra,4
4,Ahmednagar,Sorgum(Jawar),2015,April,3825.0,1600.0,2200.0,1900.0,2015-04-01,Ahmadnagar,Maharashtra,4


In [15]:
# the data is not numeric we need to make it numberic to pass it through the estimator
cat_cols = ['APMC', 'Commodity', 'Month', 'Year', 'district_name', 'state_name'] # these are the categorical columns

for col_label in cat_cols:
    train_data[col_label] = train_data[col_label].astype('category')

train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52076 entries, 0 to 52075
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   APMC             52076 non-null  category      
 1   Commodity        52076 non-null  category      
 2   Year             52076 non-null  category      
 3   Month            52076 non-null  category      
 4   arrivals_in_qtl  52076 non-null  float64       
 5   min_price        52076 non-null  float64       
 6   max_price        52076 non-null  float64       
 7   modal_price      52076 non-null  float64       
 8   date             52076 non-null  datetime64[ns]
 9   district_name    52076 non-null  category      
 10  state_name       52076 non-null  category      
 11  month_int        52076 non-null  int64         
dtypes: category(6), datetime64[ns](1), float64(4), int64(1)
memory usage: 2.8 MB


In [36]:
# spliting the data into features and labels
X = train_data.drop(['month_int', 'min_price', 'max_price', 'date'], axis=1)  # remove the non feature columns
Y = train_data[['min_price', 'max_price']].copy()  # set the labels

In [37]:
for label, col in X.items():  # replace the strings with the codes
    if pd.api.types.is_categorical_dtype(col):
        X[label] = col.cat.codes
        
X.head()

Unnamed: 0,APMC,Commodity,Year,Month,arrivals_in_qtl,modal_price,district_name,state_name
0,3,6,1,0,79.0,1463.0,0,0
1,3,6,2,0,106.0,1875.0,0,0
2,3,194,1,0,1253.0,1731.0,0,0
3,3,194,2,0,387.0,1999.0,0,0
4,3,166,1,0,3825.0,1900.0,0,0


In [18]:
# let's import the test set also to test our model on.
test_data = pd.read_csv('data/test.csv', parse_dates=['date'])
test_data.head()

Unnamed: 0,APMC,Commodity,Year,Month,arrivals_in_qtl,min_price,max_price,modal_price,date,district_name,state_name
0,Solapur,Capsicum,2015,January,203.0,979.0,2112.0,1485.0,2015-01-01,Solapur,Maharashtra
1,Solapur,Capsicum,2016,January,167.0,1658.0,2928.0,2260.0,2016-01-01,Solapur,Maharashtra
2,Solapur,Mula Shenga,2015,January,54.0,1416.0,2984.0,2097.0,2015-01-01,Solapur,Maharashtra
3,Solapur,Mula Shenga,2016,January,121.0,1275.0,2731.0,2025.0,2016-01-01,Solapur,Maharashtra
4,Solapur,Ridge Gourd,2015,January,79.0,1138.0,4063.0,2538.0,2015-01-01,Solapur,Maharashtra


In [19]:
# converting the test data columns to category
cat_cols = ['APMC', 'Commodity', 'Month', 'Year', 'district_name', 'state_name'] 

for col_label in cat_cols:
    test_data[col_label] = test_data[col_label].astype('category')

test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   APMC             10000 non-null  category      
 1   Commodity        10000 non-null  category      
 2   Year             10000 non-null  category      
 3   Month            10000 non-null  category      
 4   arrivals_in_qtl  10000 non-null  float64       
 5   min_price        10000 non-null  float64       
 6   max_price        10000 non-null  float64       
 7   modal_price      10000 non-null  float64       
 8   date             10000 non-null  datetime64[ns]
 9   district_name    10000 non-null  category      
 10  state_name       10000 non-null  category      
dtypes: category(6), datetime64[ns](1), float64(4)
memory usage: 491.6 KB


In [20]:
# split the test data
X_test = test_data.drop(['min_price', 'max_price', 'date'], axis=1) 
Y_test = test_data[['min_price', 'max_price']].copy()  

In [35]:
for label, col in X_test.items():
    if pd.api.types.is_categorical_dtype(col):
        X_test[label] = col.cat.codes
        
X_test.head()

Unnamed: 0,APMC,Commodity,Year,Month,arrivals_in_qtl,modal_price,district_name,state_name
0,287,54,1,4,203.0,1485.0,28,0
1,287,54,2,4,167.0,2260.0,28,0
2,287,167,1,4,54.0,2097.0,28,0
3,287,167,2,4,121.0,2025.0,28,0
4,287,208,1,4,79.0,2538.0,28,0


#### Fitting the models

In [42]:
models = {
    "RandomForest": RandomForestRegressor(),
    'LinearRegression': LinearRegression(),
#     'SupportVectorRegressor': SVR(),  # support vector machines do not support mulitple regression
#     'SupportVectorRegressor(linear)': SVR(kernel='linear')
}

In [43]:
scores = {}

for name, model in models.items():
    print(f"fitting {name}")
    model.fit(X, Y)
    scores[name] = model.score(X_test, Y_test)

fitting RandomForest
fitting LinearRegression


In [44]:
scores

{'RandomForest': 0.9180554197289613, 'LinearRegression': 0.9291048975373328}

In [45]:
rf_reg = models['RandomForest']
y_preds = rf_reg.predict(X_test)

print(f"RandomForest: mae:{mean_absolute_error(y_preds, Y_test)}, rmse: {np.sqrt(mean_squared_error(y_preds, Y_test))}")

RandomForest: mae:455.1827400000002, rmse: 872.2809202565649


In [46]:
lr_reg = models['LinearRegression']
y_preds = lr_reg.predict(X_test)

print(f"LinearRegression: mae:{mean_absolute_error(y_preds, Y_test)}, rmse: {np.sqrt(mean_squared_error(y_preds, Y_test))}")

LinearRegression: mae:427.25335766893033, rmse: 805.2225832676394


In [55]:
%%time
# Let's try to fine tune the Random Forest
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators':[10, 50, 100, 200],
    'max_depth':[None, 3, 5],
    'min_samples_split':[2, 6, 8],
    'min_samples_leaf':[1, 4, 8],
    'max_features': [0.5, 1, 'sqrt', 'auto'],
}

gs_reg = GridSearchCV(RandomForestRegressor(), param_grid, cv=5, verbose=2, scoring='neg_mean_squared_error', n_jobs=-1)
gs_reg.fit(X, Y)

Fitting 5 folds for each of 432 candidates, totalling 2160 fits


KeyboardInterrupt: 