In [1]:
import numpy as np
import pandas as pd
import pickle

In [2]:
# Load the previously trained random forest from disk.
# NOTE(review): pickle.load executes arbitrary code embedded in the file --
# only load model files that come from a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open("random_forest_model.sav","rb") as model_file:
    model = pickle.load(model_file)

In [3]:
# Read the preprocessed training set, split off the "revenue" target, and
# keep only the feature columns (the "Unnamed: 0" column is discarded too).
training_data = pd.read_csv("processed_train_data.csv")
training_targets = training_data["revenue"]
training_data = training_data.drop(["Unnamed: 0","revenue"],axis = 1)


In [4]:
# Keep the feature names so importance scores can be mapped back to columns,
# and report how many features there are.
cols = training_data.columns
print(cols.size)

43


In [5]:
# Rank the features by the importance the fitted forest assigned to them.
importances = model.feature_importances_

# Importances are matched to column names purely by position. If `model` was
# fitted on a different number of columns (for example, `model` is rebound by
# a later cell and this cell is re-run), scores would be silently labelled
# with the wrong feature names -- fail loudly instead.
if len(importances) != len(cols):
    raise ValueError(
        "model has %d feature importances but training data has %d columns"
        % (len(importances), len(cols))
    )

# argsort is ascending; reverse it so the most important feature comes first.
indices = np.argsort(importances)[::-1]
for i in indices:
    print("Feature importance of %s is %f" %(cols[i],importances[i]))

Feature importance of P29 is 0.222021
Feature importance of Years Open is 0.199540
Feature importance of P28 is 0.061920
Feature importance of P20 is 0.041254
Feature importance of P23 is 0.034180
Feature importance of P6 is 0.034115
Feature importance of P22 is 0.030476
Feature importance of P17 is 0.027760
Feature importance of P19 is 0.026481
Feature importance of P2 is 0.025714
Feature importance of P1 is 0.024516
Feature importance of P5 is 0.023115
Feature importance of P21 is 0.021092
Feature importance of P11 is 0.020605
Feature importance of P12 is 0.017910
Feature importance of P3 is 0.014886
Feature importance of P8 is 0.013702
Feature importance of P4 is 0.012877
Feature importance of P10 is 0.012180
Feature importance of P25 is 0.011336
Feature importance of P13 is 0.010841
Feature importance of P27 is 0.010533
Feature importance of FC is 0.010219
Feature importance of IL is 0.007023
Feature importance of Big Cities is 0.006748
Feature importance of P9 is 0.006601
Feature 

In [6]:
# How many of the highest-ranked features to keep for the reduced model.
N_TOP_FEATURES = 10

# `indices` is ordered most-important-first, so its leading entries are the
# column positions of the top features.
top_features = np.array(cols[indices[:N_TOP_FEATURES]])
print(top_features)

['P29' 'Years Open' 'P28' 'P20' 'P23' 'P6' 'P22' 'P17' 'P19' 'P2']


In [7]:
# Restrict the training features to the selected top columns.
train_data = training_data.loc[:, top_features]
print(train_data.shape)

(137, 10)


In [8]:
# Preview the first five rows of the reduced training frame.
print(train_data.head(5))

   P29  Years Open  P28  P20  P23  P6  P22  P17  P19   P2
0  3.0          19  2.0    4    3   2    3    2    5  5.0
1  3.0          10  3.0    2    2   2    3    0    3  5.0
2  3.0           5  1.0    1    1   3    1    0    1  4.0
3  7.5           6  2.5   12   10   4    1    3   20  4.5
4  3.0           9  1.0    2    1   2    2    1    2  4.0


In [9]:
# Read the preprocessed test set and discard the "Unnamed: 0" column.
testing_data = pd.read_csv("processed_test_data.csv").drop("Unnamed: 0",axis = 1)

In [10]:
# Show the test set's dimensions followed by its first five rows.
print(testing_data.shape, testing_data.head(5), sep = "\n")

(100000, 43)
   P1   P2   P3   P4  P5  P6  P7  P8  P9  P10 ...  P34  P35  P36  P37  \
0   1  4.0  4.0  4.0   1   2   5   4   5    5 ...    0    0    0    0   
1   3  4.0  4.0  4.0   2   2   5   3   4    4 ...    0    0    0    0   
2   3  4.0  4.0  4.0   2   2   5   4   4    5 ...    0    0    0    0   
3   2  4.0  4.0  4.0   2   3   5   4   5    4 ...    0    0    0    0   
4   2  4.0  4.0  4.0   1   2   5   4   5    4 ...    0    0    0    0   

   Years Open  Big Cities  Other  DT  FC  IL  
0           7           0      1   0   1   0  
1           7           0      1   0   0   1  
2           5           1      0   0   1   0  
3           5           0      1   0   0   1  
4           5           0      1   0   1   0  

[5 rows x 43 columns]


In [11]:
# Restrict the test features to the same selected columns as the training set.
test_data = testing_data.loc[:, top_features]
print(test_data.shape)

(100000, 10)


In [12]:
# Preview the first five rows of the reduced test frame.
print(test_data.head(5))

   P29  Years Open  P28  P20  P23  P6  P22  P17  P19   P2
0  3.0           7  2.0    5    4   2    1    2    5  4.0
1  3.0           7  1.0    5    1   2    2    0    5  4.0
2  3.0           5  2.0    5    5   2    5    0    5  4.0
3  3.0           5  2.0    4    2   3    2    0    4  4.0
4  3.0           5  5.0    5    1   2    1    0    1  4.0


In [13]:
from sklearn.ensemble import RandomForestRegressor

# Fixed seed so the forest -- and therefore the predictions derived from it --
# is identical on every Restart & Run All. Without it each run produces
# slightly different predictions.
RANDOM_STATE = 42

# Refit a forest using only the selected top features.
# NOTE: this rebinds `model`, replacing the forest loaded from disk earlier.
model = RandomForestRegressor(n_estimators = 1000,n_jobs = -1,random_state = RANDOM_STATE)
# fit() returns the estimator itself, so `fit` is another name for `model`.
fit = model.fit(train_data,training_targets)
predictions = model.predict(test_data)

In [19]:
# Persist the forest that was refitted on the selected features.
# `pickle` is already imported in the notebook's first cell.
# The context manager closes the file deterministically, so the pickle is
# fully flushed to disk before the cell finishes.
filename = "random_forest_selected.sav"
with open(filename,"wb") as model_file:
    pickle.dump(model,model_file)

In [14]:
# Wrap the predicted values in a single-column frame and preview it.
final = pd.DataFrame({"Revenue": predictions})
print(final.head(5))

       Revenue
0  4353922.906
1  3334523.742
2  3399228.675
3  2916814.925
4  4684316.184


In [15]:
# Write the predictions (including the frame's index) to a CSV file.
final.to_csv(path_or_buf = "select_features_prediction.csv")

In [16]:
from keras.models import Sequential
from keras.layers import Dense

Using TensorFlow backend.


In [17]:
# Number of input features fed to the network: one per selected column.
num_variables = len(top_features)


def basic_model():
    """Build and compile a small fully connected regression network.

    The network has one hidden ReLU layer with `num_variables` units taking
    `num_variables` inputs, followed by a single-unit output layer. It is
    compiled with mean squared error loss and the Adam optimizer.

    Returns:
        The compiled Keras `Sequential` model (not yet fitted).
    """
    network = Sequential([
        Dense(num_variables,input_dim = num_variables,activation = "relu"),
        Dense(1),
    ])
    network.compile(optimizer = "adam",loss = "mean_squared_error")
    return network

In [18]:
# Confirm the feature matrix and the target vector have matching row counts.
print(train_data.shape, training_targets.shape, sep = "\n")

(137, 10)
(137,)
