# Random Forest

In [133]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Loading Data

In [134]:
train_set = pd.read_csv('train.csv')
train_set.head(3)

Unnamed: 0,id,title,Rating,maincateg,platform,price1,actprice1,Offer %,norating1,noreviews1,star_5f,star_4f,star_3f,star_2f,star_1f,fulfilled1
0,16695,Fashionable & Comfortable Bellies For Women (...,3.9,Women,Flipkart,698,999,30.13%,38.0,7.0,17.0,9.0,6.0,3,3,0
1,5120,Combo Pack of 4 Casual Shoes Sneakers For Men ...,3.8,Men,Flipkart,999,1999,50.03%,531.0,69.0,264.0,92.0,73.0,29,73,1
2,18391,Cilia Mode Leo Sneakers For Women (White),4.4,Women,Flipkart,2749,4999,45.01%,17.0,4.0,11.0,3.0,2.0,1,0,1


In [135]:
# Loading X_train & y_train

X_train_orig = train_set.drop(['Offer %', 'price1'], axis=1)
print(X_train_orig.shape)  # same as X_test !
X_train_orig.head(2)

(15730, 14)


Unnamed: 0,id,title,Rating,maincateg,platform,actprice1,norating1,noreviews1,star_5f,star_4f,star_3f,star_2f,star_1f,fulfilled1
0,16695,Fashionable & Comfortable Bellies For Women (...,3.9,Women,Flipkart,999,38.0,7.0,17.0,9.0,6.0,3,3,0
1,5120,Combo Pack of 4 Casual Shoes Sneakers For Men ...,3.8,Men,Flipkart,1999,531.0,69.0,264.0,92.0,73.0,29,73,1


In [136]:
# y_train
y_train_offer = train_set['Offer %']
y_train_price = train_set['price1']
y_train_offer.head()
y_train_price.shape

(15730,)

## Encode Columns

In [137]:
X_train_orig.isna().sum()

id              0
title           0
Rating          0
maincateg     526
platform        0
actprice1       0
norating1     678
noreviews1    578
star_5f       588
star_4f       539
star_3f       231
star_2f         0
star_1f         0
fulfilled1      0
dtype: int64

In [138]:
# Filling maincateg NaN using title
def fill_maincateg(df):
    for ind, item in enumerate(df.maincateg):
        # print(item)
        
        # how else to check if item is nan
        if(item!="Men" and item != "Women"):
            # print(df.title[ind])
            #if(df.title[ind].str.contains('Men')):
            if("Men" in df.title[ind]):
                df.loc[ind, "maincateg"] = 'Men'
            else:
                df.loc[ind, "maincateg"] = 'Women'
    print("Done")
    
    return df

In [139]:
train_na_cols = {'norating1': X_train_orig.norating1.mean(), 'noreviews1': X_train_orig.noreviews1.mean()}
train_na_cols

{'norating1': 3057.6607759766143, 'noreviews1': 423.97630675818374}

In [140]:
# for encoding 'train_set'

def encode_train_cols(X):
    # Filling maincateg using title
    #X.maincateg = X.maincateg.fillna('Men' if X.title.str.con)
    fill_maincateg(X)
    
    # Drop "title" & "id" & ratings
    cols_to_drop = ['id', 'title', 'star_5f', 'star_4f', 'star_3f', 'star_2f', 'star_1f']
    X.drop(cols_to_drop, axis=1, inplace=True)
    
    # Handling Missing values
    # replacing with most common value in train set
    X.fillna(train_na_cols, inplace=True)
    
    # OHE "maincateg" & "platform"
    dummy_features = ['maincateg', 'platform']
    X = pd.get_dummies(X, columns=dummy_features)
    
    return X

In [141]:
test_na_cols = {'Rating': X_train_orig.Rating.mean()}
test_na_cols

{'Rating': 4.012873490146164}

In [142]:
# for encoding 'test set'
def encode_test_cols(X):
    # Filling maincateg using title
    fill_maincateg(X)
    
    # Drop "title" & "id" & 'norating1'
    cols_to_drop = ['id', 'title', 'star_5f', 'star_4f', 'star_3f', 'star_2f', 'star_1f']
    X.drop(cols_to_drop, axis=1, inplace=True)
    
    # Handling Missing values
    # replacing with most common value in train set
    X.fillna(test_na_cols, inplace=True)
    
    # OHE "maincateg" & "platform"
    dummy_features = ['maincateg', 'platform']
    X = pd.get_dummies(X, columns=dummy_features)
    
    return X

In [143]:
X_train = encode_train_cols(X_train_orig)
print(X_train.shape)
X_train.head()

Done
(15730, 9)


Unnamed: 0,Rating,actprice1,norating1,noreviews1,fulfilled1,maincateg_Men,maincateg_Women,platform_Amazon,platform_Flipkart
0,3.9,999,38.0,7.0,0,0,1,0,1
1,3.8,1999,531.0,69.0,1,1,0,0,1
2,4.4,4999,17.0,4.0,1,0,1,0,1
3,4.2,724,46413.0,6229.0,1,1,0,0,1
4,3.9,2299,77.0,3.0,1,1,0,0,1


In [144]:
X_train_orig.head()

# cols dropped
# na filled
# ohe left

Unnamed: 0,Rating,maincateg,platform,actprice1,norating1,noreviews1,fulfilled1
0,3.9,Women,Flipkart,999,38.0,7.0,0
1,3.8,Men,Flipkart,1999,531.0,69.0,1
2,4.4,Women,Flipkart,4999,17.0,4.0,1
3,4.2,Men,Flipkart,724,46413.0,6229.0,1
4,3.9,Men,Flipkart,2299,77.0,3.0,1


## Training Model - RF

In [62]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor   # shit

In [63]:
y_train_price.head()

0     698
1     999
2    2749
3     518
4    1379
Name: price1, dtype: int64

In [64]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train,y_train_price,test_size=0.15, random_state=0)
X_train.shape

(13370, 9)

In [65]:
rf = RandomForestRegressor(n_estimators=20)
rf.fit(X_train, y_train)

RandomForestRegressor(n_estimators=20)

In [66]:
print(rf.score(X_train, y_train))
rf.score(X_valid, y_valid)

# very less on valid - overfit

0.9802729275094352


0.9048277538101896

In [68]:
# from sklearn.metrics import accuracy_score
# pred_val = rf.predict(X_valid)
# print(accuracy_score(y_valid, pred_val))

In [69]:
from sklearn.metrics import mean_squared_error

pred_train = rf.predict(X_train)
print("Train: ", np.sqrt(mean_squared_error(y_train, pred_train)))

pred_val = rf.predict(X_valid)
print("Val: ", np.sqrt(mean_squared_error(y_valid, pred_val)))

Train:  91.50795306050239
Val:  196.54385102207354


#### Generate submission file for rf
////////////////////////////

In [70]:
X_test = pd.read_csv('test.csv')
test_id = X_test['id']
print(test_id[:3])

0     2242
1    20532
2    10648
Name: id, dtype: int64


In [71]:
X_test = encode_test_cols(X_test)
X_test.head()

Done


Unnamed: 0,Rating,actprice1,norating1,noreviews1,fulfilled1,maincateg_Men,maincateg_Women,platform_Amazon,platform_Flipkart
0,3.8,999,27928,3543,1,1,0,0,1
1,3.9,499,3015,404,1,0,1,0,1
2,3.9,999,449,52,1,0,1,0,1
3,3.9,2999,290,40,1,1,0,0,1
4,3.9,999,2423,326,0,1,0,0,1


In [72]:
pred_test = rf.predict(X_test)
pred_test[:5]

array([423.72      , 291.47      , 435.85      , 859.5       ,
       399.97142857])

In [73]:
subm_file = pd.DataFrame(test_id)
subm_file['price1'] = pred_test
subm_file.head()

Unnamed: 0,id,price1
0,2242,423.72
1,20532,291.47
2,10648,435.85
3,20677,859.5
4,12593,399.971429


In [74]:
subm_file.to_csv('5_rf.csv', index=False)

**Score: 199** Damn??

///////////////////////////////////////////////////////////////////////////////////////////////

In [75]:
def score(model, title):
    model.fit(X_train, y_train)
    
    print("RMSE for", title, ": ")
    
    pred_train = model.predict(X_train)
    print("Train: ", np.sqrt(mean_squared_error(y_train, pred_train)))

    pred_val = model.predict(X_valid)
    print("Val: ", np.sqrt(mean_squared_error(y_valid, pred_val)))
    
    # print("Accurancy for ", title)
    # print("\tTrain: ", model.score(X_train, y_train))
    # print("\tTest: ", model.score(X_valid, y_valid))

In [76]:
RF = RandomForestRegressor(n_estimators=1000, max_depth=10, random_state=0)
score(RF, "RandomForest")

RMSE for RandomForest : 
Train:  173.29092056470597
Val:  220.28506755619608


#### Generate submission file for RF     
////////////////////////////

In [43]:
# pred_val = RF.predict(X_valid)
# print("Val: ", np.sqrt(mean_squared_error(y_valid, pred_val)))

# means RF is fitted globally _/

Val:  303.9508448774159


In [77]:
pred_test2 = RF.predict(X_test)

In [78]:
subm_file = pd.DataFrame(test_id)
subm_file['price1'] = pred_test2
subm_file.head()

Unnamed: 0,id,price1
0,2242,436.641355
1,20532,293.234288
2,10648,443.537603
3,20677,950.391864
4,12593,412.550063


In [79]:
subm_file.to_csv('5_rf_2.csv', index=False)

**Score: 225** hmm, expected.

///////////////////////////////////////////////

### Ujjwal's RF

In [80]:
SRF = RandomForestRegressor(max_depth=30,max_features=5,min_samples_leaf=1,min_samples_split=2,n_estimators=580,bootstrap=True)
score(SRF, "SRF")

RMSE for SRF : 
Train:  81.18974540842143
Val:  183.916724981528


In [84]:
def gen_subm_file(model):
    X_test = pd.read_csv('test.csv')
    test_id = X_test['id']
    
    X_test = encode_test_cols(X_test)
    
    pred_test = model.predict(X_test)
    
    subm_file = pd.DataFrame(test_id)
    subm_file['price1'] = pred_test
    
    return subm_file

In [86]:
subm_file = gen_subm_file(SRF)
subm_file.to_csv("5_rf_3.csv", index=False)

Done


In [87]:
subm_file = pd.read_csv('5_rf_3.csv')
print(subm_file.isna().sum())
subm_file.head()

id        0
price1    0
dtype: int64


Unnamed: 0,id,price1
0,2242,426.146653
1,20532,295.966915
2,10648,447.401416
3,20677,947.751724
4,12593,399.756108


**Score: 191** best

## Feature Scaling

In [145]:
def normalize(X):
    features = X.columns
    X[features] /= X_train[features].max()
    return X

In [146]:
X_train_norm = normalize(X_train.copy())
X_train_norm.head()

Unnamed: 0,Rating,actprice1,norating1,noreviews1,fulfilled1,maincateg_Men,maincateg_Women,platform_Amazon,platform_Flipkart
0,0.78,0.074005,0.000131,0.000154,0.0,0.0,1.0,0.0,1.0
1,0.76,0.148085,0.001831,0.001518,1.0,1.0,0.0,0.0,1.0
2,0.88,0.370324,5.9e-05,8.8e-05,1.0,0.0,1.0,0.0,1.0
3,0.84,0.053634,0.16006,0.137058,1.0,1.0,0.0,0.0,1.0
4,0.78,0.170309,0.000266,6.6e-05,1.0,1.0,0.0,0.0,1.0


In [110]:
X_train.head()  # should not get normalized

Unnamed: 0,Rating,actprice1,norating1,noreviews1,fulfilled1,maincateg_Men,maincateg_Women,platform_Amazon,platform_Flipkart
0,3.9,999,38.0,7.0,0,0,1,0,1
1,3.8,1999,531.0,69.0,1,1,0,0,1
2,4.4,4999,17.0,4.0,1,0,1,0,1
3,4.2,724,46413.0,6229.0,1,1,0,0,1
4,3.9,2299,77.0,3.0,1,1,0,0,1


In [147]:
X_train_norm.shape  # should be (15730, 9)

(15730, 9)

In [148]:
y_train_offer = y_train_offer.str.replace(r'%', '')
y_train_offer = y_train_offer.astype(float)
y_train_offer.head()

0    30.13
1    50.03
2    45.01
3    15.85
4    40.02
Name: Offer %, dtype: float64

In [149]:
y_train_offer /= 100
y_train_offer.head()
y_train_offer.describe()

count    15730.000000
mean         0.468025
std          0.192687
min          0.000000
25%          0.359400
50%          0.500700
75%          0.601600
max          0.889300
Name: Offer %, dtype: float64

In [150]:
y_train_offer.shape  # should be (15730,)

(15730,)

### Training - after feature scaling

In [151]:
X_train2, X_valid2, y_train2, y_valid2 = train_test_split(X_train_norm,y_train_offer,test_size=0.15, random_state=0)
X_train2.shape

(13370, 9)

In [152]:
def score2(model, title):
    print("fitting the model..")
    model.fit(X_train2, y_train2)
    
    print("RMSE for", title, ": ")
    
    pred_train = model.predict(X_train2)
    print("Train: ", np.sqrt(mean_squared_error(y_train2, pred_train)))

    pred_val = model.predict(X_valid2)
    print("Val: ", np.sqrt(mean_squared_error(y_valid2, pred_val)))

In [117]:
rf2 = RandomForestRegressor(n_estimators=20)
score2(rf2, "RF2")

RMSE for RF2 : 
Train:  0.047781799937709174
Val:  0.1114509153288575


In [118]:
# offer is offer%
def predict_price(offer, test_actprice):
    # offer *= 100
    test_actprice -= (test_actprice * offer)
    return test_actprice

In [126]:
def gen_subm_file2(model):
    X_test = pd.read_csv('test.csv')
    test_id = X_test['id']
    test_actprice = X_test['actprice1']
    
    X_test = encode_test_cols(X_test)
    X_test = normalize(X_test)
    
    pred_test_offer = model.predict(X_test)
#     print("offer: ", pred_test_offer[:5])
    pred_test_price = predict_price(pred_test_offer, test_actprice)
#     print("price: ", pred_test_price[:5])
    
    subm_file = pd.DataFrame(test_id)
    subm_file['price1'] = pred_test_price
    
    return subm_file

In [127]:
subm_file = gen_subm_file2(rf2)
subm_file.to_csv("5_rf_norm.csv", index=False)

Done


In [129]:
subm_file = pd.read_csv('5_rf_norm.csv')
subm_file.head()

Unnamed: 0,id,price1
0,2242,424.082398
1,20532,285.818052
2,10648,400.85874
3,20677,838.085545
4,12593,401.5314


**Score: 197** Why was expecting a significant inc after norm? ;-;

### Norm Train using GridSearchCV

In [153]:
from sklearn.model_selection import GridSearchCV

In [162]:
rfc=RandomForestRegressor(random_state=0)
param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8,9,10],
    'criterion' :['squared_error']
}
param_grid

{'n_estimators': [200, 500],
 'max_features': ['auto', 'sqrt', 'log2'],
 'max_depth': [4, 5, 6, 7, 8, 9, 10],
 'criterion': ['squared_error']}

In [None]:
# param_grid = {  'bootstrap': [True], 'max_depth': [5, 10, None], 'max_features': ['auto', 'log2'], 'n_estimators': [5, 6, 7, 8, 9, 10, 11, 12, 13, 15]}

In [163]:
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
print(CV_rfc.best_params_)
score2(CV_rfc, "CV RF")

RMSE for CV RF : 
Train:  0.12391851669903312
Val:  0.13590428983143404


In [171]:
CV_rfc.best_params_

{'criterion': 'squared_error',
 'max_depth': 10,
 'max_features': 'auto',
 'n_estimators': 500}

In [168]:
subm_file = gen_subm_file2(rf2)
subm_file.to_csv("5_rf_CV.csv", index=False)

Done


In [169]:
subm_file.head()

Unnamed: 0,id,price1
0,2242,424.082398
1,20532,285.818052
2,10648,400.85874
3,20677,838.085545
4,12593,401.5314


**Score: 197**

/////////////////////////////////////////////// ROUGH ////////////////////////////////////////////////

**Does RF not work for regression problem (with float values)**

Yes it does.
**RandomForestRegressor v/s RandomForestClassifier**

In [62]:
rf.classes_

array([  69,   91,   98, ..., 5801, 5984, 5998], dtype=int64)

In [63]:
pred = rf.predict(X_valid)

In [70]:
X_valid.head()

Unnamed: 0,Rating,actprice1,norating1,noreviews1,fulfilled1,maincateg_Men,maincateg_Women,platform_Amazon,platform_Flipkart
519,4.2,899,531.0,75.0,1,0,1,0,1
10966,4.0,1497,272.0,37.0,0,0,1,0,1
15068,4.1,499,61.0,6.0,1,0,1,0,1
223,3.9,499,96.0,17.0,0,0,1,0,1
10734,3.8,849,2134.0,245.0,1,1,0,0,1


In [95]:
y_valid.head()

519      849
10966    340
15068    319
223      299
10734    714
Name: price1, dtype: int64

In [79]:
y_valid[519]

849

In [96]:
ind = 3
val = pred[ind]
val

299

In [97]:
# val = 241
if(val in rf.classes_):
    print("yes")
else:
    print("no")

yes


In [99]:
pred_train = rf.predict(X_train)
y_train.head()

13681    1499
11135     498
2206      407
4446      796
14137    1470
Name: price1, dtype: int64

In [104]:
ind = 4
val = pred_train[ind]
print(val)

if(val in rf.classes_):
    print("yes")
else:
    print("no")

1379
yes


Underfit ?

////////////////////////////////////////////////////////////