# Predicting the score of customers.

A `RandomForestRegressor` is used first to predict the score directly. The target column `"Spending Score (1-100)"` is then divided into 4 categories and a `RandomForestClassifier` is used instead.

In [1]:
import sklearn
import numpy as np
import pandas as pd

In [2]:
# Load the customer table; the first CSV column is used as the row index.
mall_customers = pd.read_csv(
    'Mall_Customers.csv',
    index_col=0,
)
# Preview the first rows as the cell output.
mall_customers.head()

Unnamed: 0_level_0,Genre,Age,Annual Income (k$),Spending Score (1-100)
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Male,19,15,39
2,Male,21,15,81
3,Female,20,16,6
4,Female,23,16,77
5,Female,31,17,40


In [4]:
# Separate the regression target (y) from the remaining feature columns (X).
y = mall_customers["Spending Score (1-100)"]
X = mall_customers.drop(columns="Spending Score (1-100)")

In [5]:
# Turn the categorical "Genre" column into one-hot indicator columns;
# every other column is passed through unchanged.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

categorical_features = ["Genre"]
one_hot = OneHotEncoder()

# A single (name, transformer, columns) step; remainder='passthrough'
# keeps the columns that are not listed in `categorical_features`.
encoding_steps = [('one_hot', one_hot, categorical_features)]
transformer = ColumnTransformer(encoding_steps, remainder='passthrough')

transformed_X = transformer.fit_transform(X)

In [6]:
# Hold out 20% of the rows for evaluation, then fit a default random forest.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Seed NumPy's global RNG once, before the split and the fit, since neither
# call below is given its own random_state.
np.random.seed(100)

X_train, X_test, y_train, y_test = train_test_split(
    transformed_X,
    y,
    test_size=0.2,
)

# fit() returns the estimator itself, so `model` is the fitted forest.
model = RandomForestRegressor().fit(X_train, y_train)

# Score on the held-out rows; shown as the cell output.
model.score(X_test, y_test)

0.32224686766273636

In [7]:
# Tuning: sweep the number of trees (10, 20, 30, 40) with
# min_samples_leaf=3 and min_samples_split=3.
# The printed percentage is model.score(X_test, y_test) * 100; for a
# scikit-learn regressor that score is R^2, not a classification accuracy.
# Best value seen over many manual runs so far: 50.37%.
# NOTE(review): the settings are being compared on the test split itself,
# so the best figure is likely optimistic.
np.random.seed(0)
for n_trees in range(10, 41, 10):
    model = RandomForestRegressor(
        n_estimators=n_trees,
        min_samples_leaf=3,
        min_samples_split=3,
    ).fit(X_train, y_train)
    score_pct = model.score(X_test, y_test) * 100
    # end="\n\n" emits the same blank separator line as a following print("").
    print(f"Using {n_trees} estimators: {score_pct:.2f}%", end="\n\n")

Using 10 estimators: 32.03%

Using 20 estimators: 35.51%

Using 30 estimators: 44.97%

Using 40 estimators: 50.37%



In [9]:
# Saving the regressor (the dump below is left commented out, so nothing is
# written when this cell runs).
# At this point `model` is the last forest fitted by the tuning loop above
# (the 40-estimator one).
# NOTE(review): if re-enabled, prefer
#     with open("random-forest-regressor-model-1.pkl", "wb") as f:
#         pickle.dump(model, f)
# so the file handle is closed deterministically.
import pickle

# pickle.dump(model, open("random-forest-regressor-model-1.pkl", "wb"))

In [10]:
# Summary statistics for the numeric columns. The 25%/50%/75% rows of
# "Spending Score (1-100)" are the cut points used below to split that
# column into 4 categories.
mall_customers.describe()

Unnamed: 0,Age,Annual Income (k$),Spending Score (1-100)
count,200.0,200.0,200.0
mean,38.85,60.56,50.2
std,13.969007,26.264721,25.823522
min,18.0,15.0,1.0
25%,28.75,41.5,34.75
50%,36.0,61.5,50.0
75%,49.0,78.0,73.0
max,70.0,137.0,99.0


In [32]:
# Converting "Spending Score (1-100)" into 4 quartile categories
# (1 = lowest quarter, 4 = highest quarter).
# The cut points are the 25th/50th/75th percentiles of the column itself,
# computed here rather than copied by hand from the describe() output.
spending_score = mall_customers["Spending Score (1-100)"]
quartile_bounds = spending_score.quantile([0.25, 0.50, 0.75]).to_numpy()

# right=True makes every upper bound inclusive, i.e. the bins are
# (-inf, q25], (q25, q50], (q50, q75], (q75, +inf) -> 0..3; shift to 1..4.
# One vectorized call replaces the per-row np.append, which re-copied the
# whole array on every iteration.
# Labels stay floats so `cat` keeps the dtype the loop-based version produced.
cat = (np.digitize(spending_score.to_numpy(), quartile_bounds, right=True) + 1).astype(float)

cat

array([2., 4., 1., 4., 2., 4., 1., 4., 1., 3., 1., 4., 1., 4., 1., 4., 2.,
       3., 1., 4., 2., 3., 1., 3., 1., 4., 1., 3., 1., 4., 1., 3., 1., 4.,
       1., 4., 1., 3., 1., 4., 2., 4., 2., 3., 1., 3., 3., 2., 2., 2., 3.,
       3., 3., 3., 2., 2., 2., 2., 3., 2., 3., 3., 3., 3., 3., 3., 2., 2.,
       3., 2., 3., 2., 2., 3., 2., 3., 3., 2., 3., 2., 3., 3., 2., 2., 3.,
       2., 3., 3., 3., 2., 3., 2., 2., 2., 2., 3., 2., 2., 2., 2., 2., 2.,
       3., 3., 3., 2., 2., 2., 2., 2., 3., 3., 2., 2., 2., 2., 2., 3., 2.,
       3., 3., 2., 3., 4., 1., 4., 2., 4., 1., 4., 1., 4., 1., 3., 1., 4.,
       1., 3., 1., 3., 1., 4., 2., 4., 1., 4., 2., 4., 1., 4., 1., 4., 1.,
       4., 1., 4., 1., 4., 1., 3., 2., 4., 1., 4., 1., 4., 1., 4., 1., 3.,
       1., 4., 1., 4., 1., 4., 1., 3., 1., 4., 1., 4., 1., 4., 2., 4., 1.,
       3., 1., 4., 1., 3., 1., 4., 1., 4., 1., 4., 1., 4.])

In [62]:
# Classification setup: same features as before, but the target is now the
# category array `cat` instead of the raw spending score.
from sklearn.ensemble import RandomForestClassifier

# Seed NumPy's global RNG before the split and the fit, since neither call
# below is given its own random_state.
np.random.seed(20)

y = cat
X = mall_customers.drop(columns=["Spending Score (1-100)"])

# One-hot encode "Genre"; all remaining columns pass through unchanged.
categorical_features = ["Genre"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer(
    [('one_hot', one_hot, categorical_features)],
    remainder='passthrough',
)
transformed_X = transformer.fit_transform(X)

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    transformed_X,
    y,
    test_size=0.2,
)

# fit() returns the estimator itself, so `clf` is the fitted classifier.
clf = RandomForestClassifier().fit(X_train, y_train)

# Score on the held-out rows; shown as the cell output.
clf.score(X_test, y_test)

0.6

In [77]:
# Tuning: sweep the number of trees (10 through 50 in steps of 10) with
# min_samples_leaf=10, min_samples_split=10 and max_depth=15, printing
# clf.score(X_test, y_test) * 100 for each setting.
# NOTE(review): the settings are being compared on the test split itself.
np.random.seed(40)
for n_trees in range(10, 51, 10):
    clf = RandomForestClassifier(
        n_estimators=n_trees,
        min_samples_leaf=10,
        min_samples_split=10,
        max_depth=15,
    ).fit(X_train, y_train)
    score_pct = clf.score(X_test, y_test) * 100
    # end="\n\n" emits the same blank separator line as a following print("").
    print(f"Using {n_trees} estimators: {score_pct:.2f}%", end="\n\n")

Using 10 estimators: 65.00%

Using 20 estimators: 60.00%

Using 30 estimators: 67.50%

Using 40 estimators: 67.50%

Using 50 estimators: 70.00%



In [82]:
# Saving the classifier (the dump below is left commented out, so nothing is
# written when this cell runs).
# At this point `clf` is the last classifier fitted by the tuning loop above
# (the 50-estimator one).
# NOTE(review): if re-enabled, prefer
#     with open("random-forest-classifier-model-1.pkl", "wb") as f:
#         pickle.dump(clf, f)
# so the file handle is closed deterministically.
import pickle

# pickle.dump(clf, open("random-forest-classifier-model-1.pkl", "wb"))