# SUV Customer Classifier using Logistic Regression (Classification Model)

## Importing the libraries

In [8]:
"""
Importing libraries
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Loading in the dataset

In [6]:
# Read the ads dataset; the path is relative to the notebook's working directory.
dataset = pd.read_csv("../dataset/Social_Network_Ads.csv")

# Every column except the last forms the feature matrix (x); the last column is
# the dependent variable (y). Both are extracted as NumPy arrays.
x, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

## Splitting the dataset

In [9]:
# Hold out 25% of the rows for testing and train on the remaining 75%.
# (An 80/20 split is the more commonly recommended alternative.)
# random_state is fixed so the same partition is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)

In [10]:
# Print the held-out feature rows followed by a text label. At this point the
# values are still unscaled; scaling is applied in the Feature Scaling section.
print(f'{x_test} testing features')

[[    30  87000]
 [    38  50000]
 [    35  75000]
 [    30  79000]
 [    35  50000]
 [    27  20000]
 [    31  15000]
 [    36 144000]
 [    18  68000]
 [    47  43000]
 [    30  49000]
 [    28  55000]
 [    37  55000]
 [    39  77000]
 [    20  86000]
 [    32 117000]
 [    37  77000]
 [    19  85000]
 [    55 130000]
 [    35  22000]
 [    35  47000]
 [    47 144000]
 [    41  51000]
 [    47 105000]
 [    23  28000]
 [    49 141000]
 [    28  87000]
 [    29  80000]
 [    37  62000]
 [    32  86000]
 [    21  88000]
 [    37  79000]
 [    57  60000]
 [    37  53000]
 [    24  58000]
 [    18  52000]
 [    22  81000]
 [    34  43000]
 [    31  34000]
 [    49  36000]
 [    27  88000]
 [    41  52000]
 [    27  84000]
 [    35  20000]
 [    43 112000]
 [    27  58000]
 [    37  80000]
 [    52  90000]
 [    26  30000]
 [    49  86000]
 [    57 122000]
 [    34  25000]
 [    35  57000]
 [    34 115000]
 [    59  88000]
 [    45  32000]
 [    29  83000]
 [    26  80000]
 [    49  2800

In [None]:
# Print the held-out dependent-variable values followed by a text label.
print(f'{y_test} testing dependent variable')

In [None]:
# Print the training feature rows followed by a text label.
print(f'{x_train} training features')

In [None]:
# Print the training dependent-variable values followed by a text label.
print(f'{y_train} training dependent variable')

## Feature Scaling

In [11]:
# Standardise the features.
# The scaler is fitted on the training rows only, and the fitted parameters are
# then applied to both partitions, so the test rows do not influence the fit.
# NOTE: x_train and x_test are overwritten with their scaled versions, so this
# cell should run exactly once after the split. `sc` is reused later to scale
# raw observations before prediction.
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

## Implementing Logistic Regression

In [12]:
# Create the logistic regression model with a fixed seed, then fit it on the
# scaled training features and their labels.
classifier = LogisticRegression(random_state=0)
classifier.fit(X=x_train, y=y_train)

## Predicting a single result


In [13]:
# Predict the class of a single observation: [30, 87000] is the first row of
# x_test as it was printed before scaling.
# predict() expects a 2D array (one inner list per observation), and the raw
# values must pass through the fitted scaler because the model was trained on
# scaled features.
first_test_row = [[30, 87000]]
classifier.predict(sc.transform(first_test_row))

array([0], dtype=int64)

## Predicting the Test Results

In [14]:
# Predict a class label for every row of the scaled test set.
y_pred = classifier.predict(x_test)

# Reshape the predictions and the true labels into column vectors so they can
# be placed side by side.
reshaped_pred = y_pred.reshape(len(y_pred), 1)
reshaped_test = y_test.reshape(len(y_test), 1)

# Column 0 holds the predicted label, column 1 the actual label.
# Previously the reshaped arrays were built but never combined or displayed.
pred_vs_actual = np.concatenate((reshaped_pred, reshaped_test), axis=1)

# Show only the first rows rather than dumping the whole comparison.
pred_vs_actual[:10]

## Creating the confusion matrix

In [15]:
# Build the confusion matrix: a 2D table of counts comparing the true labels
# with the predicted labels, so correct and incorrect predictions can be
# compared. In scikit-learn's layout, rows are the true classes and columns
# are the predicted classes.
cm = confusion_matrix(y_test, y_pred)

# Display the matrix; the assignment alone produces no cell output.
cm

## Evaluating the accuracy

In [16]:
# Evaluation: fraction of test rows whose predicted label matches the true label.
# The recorded run printed 0.89, i.e. an 89% accuracy score.
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.89
