In [None]:
#import key libraries
import numpy as np
import pandas as pd

In [None]:
#read the raw training and testing tables from the working directory
training_data, testing_data = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [None]:
#show the first five rows of the raw training table
print(training_data.head(n=5))

In [None]:
#keep an independent copy of the test-set Id values before the column is dropped
ids = testing_data["Id"].to_numpy(copy=True)

In [None]:
#remove the Id column from both tables
training_data = training_data.drop(columns=["Id"])
testing_data = testing_data.drop(columns=["Id"])

In [None]:
#swap the open date column for a "Years Open" feature: the current calendar year minus the year the restaurant opened
#NOTE(review): the reference year is read from the system clock, so the feature values shift if the notebook is re-run in a later year
current_date = pd.to_datetime("today")
current_year = current_date.year

def _with_years_open(frame):
    """Return a copy of frame with "Open Date" replaced by a trailing "Years Open" column."""
    opening_year = pd.to_datetime(frame["Open Date"]).dt.year
    result = frame.drop(columns=["Open Date"])
    result["Years Open"] = current_year - opening_year
    return result

training_data = _with_years_open(training_data)
testing_data = _with_years_open(testing_data)

In [None]:
#drop the individual city names; the author relies on other features (e.g. the city group) to carry that information
training_data, testing_data = (
    frame.drop(columns=["City"]) for frame in (training_data, testing_data)
)

In [None]:
#one-hot encode the city group: append one indicator column per city group value, then remove the original column
#train and test are encoded independently of each other
training_data = training_data.join(pd.get_dummies(training_data["City Group"])).drop(columns=["City Group"])
testing_data = testing_data.join(pd.get_dummies(testing_data["City Group"])).drop(columns=["City Group"])

In [None]:
#check the training table after the city group encoding
print(training_data.head(n=5))

In [None]:
#one-hot encode the restaurant type: append one indicator column per type value, then remove the original column
#train and test are encoded independently, so a type that appears in only one table produces a column only in that table
training_data = training_data.join(pd.get_dummies(training_data["Type"])).drop(columns=["Type"])
testing_data = testing_data.join(pd.get_dummies(testing_data["Type"])).drop(columns=["Type"])

In [None]:
#compare the first rows of both tables after encoding
for preview_frame in (training_data, testing_data):
    print(preview_frame.head())

In [None]:
#make the testing table carry exactly the training feature columns, in the training order
#this removes test-only columns such as MB (a restaurant type that isn't present in the training set)
#without hard-coding their names, and it no longer fails if such a column is already gone
#a training feature that is missing from the testing table is added and filled with 0
feature_columns = [column for column in training_data.columns if column != "revenue"]
testing_data = testing_data.reindex(columns=feature_columns, fill_value=0)

In [None]:
#persist both processed tables for later use
#to_csv is called with its defaults here, so the row index is written as an extra leading column
for processed_frame, csv_name in (
    (training_data, "processed_train_data.csv"),
    (testing_data, "processed_test_data.csv"),
):
    processed_frame.to_csv(csv_name)

In [None]:
#split the revenue column off as the regression target, leaving only features in training_data
target_column = "revenue"
training_targets = training_data[target_column]
training_data = training_data.drop(columns=[target_column])

In [None]:
#print the (rows, columns) shape of the training table, then of the testing table
print(training_data.shape, testing_data.shape, sep="\n")

In [None]:
#create and train a random forest model with 1000 trees
#random_state is pinned so that re-running the notebook rebuilds the same forest and the same predictions
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators = 1000,n_jobs = -1,random_state = 42)
#fit returns the estimator itself, so fitter refers to the same object as model
fitter = model.fit(training_data,training_targets)
predictions = model.predict(testing_data)

In [None]:
#look at the first ten predicted values
print(predictions[0:10])

In [None]:
#pair each stored test id with the model's prediction for that row
#NOTE(review): the input column is spelled "Id" but the output header is "id" - confirm which spelling the consumer of this file expects
prediction_columns = {"id": ids, "Prediction": predictions}
final_preds = pd.DataFrame(prediction_columns)

In [None]:
#show the first five rows of the prediction table
print(final_preds.head(n=5))

In [None]:
#write the prediction table to disk without the row index
predictions_path = "revenue_prediction.csv"
final_preds.to_csv(predictions_path, index=False)

In [None]:
#save the model for separate analysis
import pickle
filename = "random_forest_model.sav"
#the with-block closes the file handle (and flushes the pickle to disk) even if dump raises
with open(filename,"wb") as model_file:
    pickle.dump(model,model_file)