In [13]:
# importing required libraries
import pandas as pd
import altair as alt
import numpy as np
from sklearn.model_selection import train_test_split

In [14]:
#loading the dataset
df = pd.read_csv("Superstore.csv")

In [15]:
# Wrangling
df = df.drop_duplicates()
df = df.drop(df[df.Sales < 100].index)
df = df.drop(df[df.Profit == -6599.9780].index)

def categorise(row):  
    if row['Discount'] > 0:
        return 'Yes'
    return 'No'
df['Discount_Status'] = df.apply(lambda row: categorise(row), axis=1)
df.head()

Unnamed: 0,Ship Mode,Segment,Country,City,State,Postal Code,Region,Category,Sub-Category,Sales,Quantity,Discount,Profit,Discount_Status
0,Second Class,Consumer,United States,Henderson,Kentucky,42420,South,Furniture,Bookcases,261.96,2,0.0,41.9136,No
1,Second Class,Consumer,United States,Henderson,Kentucky,42420,South,Furniture,Chairs,731.94,3,0.0,219.582,No
3,Standard Class,Consumer,United States,Fort Lauderdale,Florida,33311,South,Furniture,Tables,957.5775,5,0.45,-383.031,Yes
7,Standard Class,Consumer,United States,Los Angeles,California,90032,West,Technology,Phones,907.152,6,0.2,90.7152,Yes
9,Standard Class,Consumer,United States,Los Angeles,California,90032,West,Office Supplies,Appliances,114.9,5,0.0,34.47,No


In [16]:
# One-hot encode the data using pandas get_dummies
df = pd.get_dummies(df)

# Labels are the values we want to predict (response variable)
response = np.array(df['Sales'])

# Remove the labels from the features (predictors)
# axis 1 refers to the columns
predictors = df.drop('Sales', axis = 1)

# Saving feature names for later use
predictors_list = list(predictors.columns)

# Convert to numpy array
predictors = np.array(predictors)

# Split the data into training and testing sets
train_predictors, test_predictors, train_response, test_response = train_test_split(df, response, test_size = 0.20, random_state = 42)

In [22]:
# Random Forest
from sklearn.ensemble import RandomForestRegressor

# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)

# Train the model on training data
rf.fit(train_predictors, train_response)

RandomForestRegressor(n_estimators=1000, random_state=42)

In [55]:
# Use the forest's predict method on the test data
predictions = rf.predict(test_predictors)

# Calculate the absolute errors
errors = (predictions - test_response)**2

# Print out the mean absolute error (mae)
print('Mean Squared Error:', round(np.mean(errors), 2), 'dollars.')

Mean Squared Error: 216603.85 dollars.


In [57]:
np.amin(errors)
np.amax(errors)

160668039.94603384

In [63]:
# Get numerical feature importances
importances = list(rf.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(predictors_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: Quantity             Importance: 0.93
Variable: Ship Mode_First Class Importance: 0.06
Variable: Postal Code          Importance: 0.0
Variable: Discount             Importance: 0.0
Variable: Profit               Importance: 0.0
Variable: Ship Mode_Same Day   Importance: 0.0
Variable: Ship Mode_Second Class Importance: 0.0
Variable: Ship Mode_Standard Class Importance: 0.0
Variable: Segment_Consumer     Importance: 0.0
Variable: Segment_Corporate    Importance: 0.0
Variable: Segment_Home Office  Importance: 0.0
Variable: Country_United States Importance: 0.0
Variable: City_Akron           Importance: 0.0
Variable: City_Albuquerque     Importance: 0.0
Variable: City_Alexandria      Importance: 0.0
Variable: City_Allen           Importance: 0.0
Variable: City_Allentown       Importance: 0.0
Variable: City_Amarillo        Importance: 0.0
Variable: City_Anaheim         Importance: 0.0
Variable: City_Andover         Importance: 0.0
Variable: City_Ann Arbor       Importance: 0.0
Var

In [58]:
test = np.array([1, 2, 3])

In [59]:
np.amin(test)

1

In [60]:
np.amax(test)

3

In [61]:
errors

array([1.19443041e-02, 1.85547987e-04, 1.08489867e-01, 4.78938739e-01,
       5.73360169e-01, 7.32324782e-04, 5.62828176e-02, 1.57529610e-03,
       1.77492005e-03, 2.16743173e-02, 1.78607664e-03, 2.11254597e-02,
       6.90896765e-01, 1.47059595e-01, 2.15848987e-02, 1.57081432e-03,
       9.73939264e-04, 2.17528960e-03, 3.90925058e-03, 1.50150713e-02,
       6.79957776e-04, 7.30296576e-04, 3.77961735e+00, 3.04090819e-02,
       6.37033845e-01, 2.36234882e-03, 3.77806916e-03, 2.63645279e-01,
       4.69155600e-06, 5.90733025e-04, 4.86908356e-04, 1.04565570e-01,
       6.35040001e-06, 1.84518684e+03, 3.65497924e-04, 1.57331223e-01,
       1.27993020e-02, 7.11608976e-04, 1.44677595e-02, 1.18956236e+02,
       3.94838803e-01, 8.58783025e-04, 5.05726500e-01, 4.19840100e-04,
       7.32876941e-02, 3.76670464e-04, 1.06587739e+04, 3.07506386e-02,
       2.96502030e-03, 9.10535808e-03, 1.49157369e+00, 8.53776006e-07,
       9.92494138e-03, 5.42385115e-01, 4.22713598e-06, 3.15655196e-01,
      