## Can you predict whether a candy is chocolate based on its other features?

### Data Preprocessing

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
 
# Load the candy dataset from the CSV next to the notebook and preview the first rows.
candy_data = pd.read_csv('candy-data.csv')
candy_data.head()

Unnamed: 0,competitorname,chocolate,fruity,caramel,peanutyalmondy,nougat,crispedricewafer,hard,bar,pluribus,sugarpercent,pricepercent,winpercent
0,100 Grand,1,0,1,0,0,1,0,1,0,0.732,0.86,66.971725
1,3 Musketeers,1,0,0,0,1,0,0,1,0,0.604,0.511,67.602936
2,One dime,0,0,0,0,0,0,0,0,0,0.011,0.116,32.261086
3,One quarter,0,0,0,0,0,0,0,0,0,0.011,0.511,46.116505
4,Air Heads,0,1,0,0,0,0,0,0,0,0.906,0.511,52.341465


#### Checking for null values

In [2]:
# Number of missing values in each column.
candy_data.isna().sum()

competitorname      0
chocolate           0
fruity              0
caramel             0
peanutyalmondy      0
nougat              0
crispedricewafer    0
hard                0
bar                 0
pluribus            0
sugarpercent        0
pricepercent        0
winpercent          0
dtype: int64

In [3]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
candy_data.describe()

Unnamed: 0,chocolate,fruity,caramel,peanutyalmondy,nougat,crispedricewafer,hard,bar,pluribus,sugarpercent,pricepercent,winpercent
count,85.0,85.0,85.0,85.0,85.0,85.0,85.0,85.0,85.0,85.0,85.0,85.0
mean,0.435294,0.447059,0.164706,0.164706,0.082353,0.082353,0.176471,0.247059,0.517647,0.478647,0.468882,50.316764
std,0.498738,0.50014,0.373116,0.373116,0.276533,0.276533,0.383482,0.433861,0.502654,0.282778,0.28574,14.714357
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011,0.011,22.445341
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.22,0.255,39.141056
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.465,0.465,47.829754
75%,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.732,0.651,59.863998
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.988,0.976,84.18029


In [4]:
# Remove the candy name column; it is not used as a model feature.
# The frame is rebound instead of mutated in place, and errors='ignore'
# makes the cell safe to re-run once the column is already gone.
candy_data = candy_data.drop(columns='competitorname', errors='ignore')

In [5]:
# Target column, followed by every remaining column (in original order) as features.
label = 'chocolate'
features = [name for name in candy_data.columns if name != label]

#### Separating out features & label

In [6]:
# Target vector and feature matrix as plain NumPy arrays.
y_values = candy_data[label].values
x_values = candy_data[features].values

#### Scaling the last 3 columns, which hold continuous values

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Standardise the three continuous columns (the last three features) using
# statistics learned from the training rows only. Fitting the scaler on every
# row would let the held-out rows' mean and variance leak into preprocessing.
#
# The training row indices are recovered with the same train_size and
# random_state that the train/test split below uses. train_test_split chooses
# rows from the sample count and the seed alone, so both calls select the same
# training rows. Keep the two sets of arguments in sync.
train_rows = train_test_split(
    np.arange(len(x_values)), train_size=0.7, random_state=0
)[0]

sc_x = StandardScaler()
sc_x.fit(x_values[train_rows, -3:])
# Apply the training-set scaling to every row, train and test alike.
x_values[:, -3:] = sc_x.transform(x_values[:, -3:])

#### Splitting dataset into train set & test set

In [8]:
from sklearn.model_selection import train_test_split

# 70% of the rows go to training and the remainder to testing;
# the fixed seed makes the partition reproducible.
(train_x_values, test_x_values,
 train_y_values, test_y_values) = train_test_split(
    x_values,
    y_values,
    train_size=0.7,
    random_state=0,
)

### Building classification model

In [9]:
from sklearn.linear_model import LogisticRegression

# Train a logistic-regression classifier with scikit-learn's default settings.
# fit() returns the estimator itself, so construction and training are chained.
classifier = LogisticRegression().fit(train_x_values, train_y_values)
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [10]:
# Predicted chocolate / not-chocolate label for each held-out candy.
test_prediction = classifier.predict(test_x_values)

In [11]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Share of held-out candies whose label was predicted correctly.
accuracy_score(y_true=test_y_values, y_pred=test_prediction)

0.8846153846153846

In [12]:
# Confusion matrix on the test set: rows are the true classes and columns the
# predicted classes (scikit-learn convention), ordered 0 = not chocolate, 1 = chocolate.
confusion_matrix(test_y_values, test_prediction)

array([[15,  0],
       [ 3,  8]])