In [None]:
#Data.csv

**Step 1: Importing the libraries**

In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import category_encoders as ce
from sklearn.impute import SimpleImputer

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay,classification_report,plot_roc_curve,accuracy_score

**Step 2: Importing dataset**

In [29]:
df = pd.read_csv('Data.csv')
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [30]:
df.describe()

Unnamed: 0,Age,Salary
count,9.0,9.0
mean,38.777778,63777.777778
std,7.693793,12265.579662
min,27.0,48000.0
25%,35.0,54000.0
50%,38.0,61000.0
75%,44.0,72000.0
max,50.0,83000.0


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


In [32]:
df.dtypes

Country       object
Age          float64
Salary       float64
Purchased     object
dtype: object

In [33]:
df.isnull().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

**Step 3: Handling the missing data**

In [34]:
imp = SimpleImputer(missing_values= np.nan,strategy = 'most_frequent')
imp.fit(df)
df = imp.transform(df)
df1 = pd.DataFrame(df)
df1.head(20)

Unnamed: 0,0,1,2,3
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,48000.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,27.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [35]:
df1.rename(columns = {0:'Country', 1:'Age', 2:'Salary', 3:'Purchased'}, inplace = True)
df1

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,48000.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,27.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [36]:
df1.isnull().sum()

Country      0
Age          0
Salary       0
Purchased    0
dtype: int64

**Step 4: Encoding categorical data**

In [37]:
from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()
label.fit(df1['Purchased'])
print(label.classes_)
df1['Purchased1'] = label.transform(df1['Purchased'])
df1.head(20)

['No' 'Yes']


Unnamed: 0,Country,Age,Salary,Purchased,Purchased1
0,France,44.0,72000.0,No,0
1,Spain,27.0,48000.0,Yes,1
2,Germany,30.0,54000.0,No,0
3,Spain,38.0,61000.0,No,0
4,Germany,40.0,48000.0,Yes,1
5,France,35.0,58000.0,Yes,1
6,Spain,27.0,52000.0,No,0
7,France,48.0,79000.0,Yes,1
8,Germany,50.0,83000.0,No,0
9,France,37.0,67000.0,Yes,1


In [38]:
df1.drop(['Purchased'], axis=1,inplace = True)

In [39]:
df1

Unnamed: 0,Country,Age,Salary,Purchased1
0,France,44.0,72000.0,0
1,Spain,27.0,48000.0,1
2,Germany,30.0,54000.0,0
3,Spain,38.0,61000.0,0
4,Germany,40.0,48000.0,1
5,France,35.0,58000.0,1
6,Spain,27.0,52000.0,0
7,France,48.0,79000.0,1
8,Germany,50.0,83000.0,0
9,France,37.0,67000.0,1


In [40]:
df1.rename(columns = {'Purchased1':'Purchased'}, inplace = True)
df1

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,0
1,Spain,27.0,48000.0,1
2,Germany,30.0,54000.0,0
3,Spain,38.0,61000.0,0
4,Germany,40.0,48000.0,1
5,France,35.0,58000.0,1
6,Spain,27.0,52000.0,0
7,France,48.0,79000.0,1
8,Germany,50.0,83000.0,0
9,France,37.0,67000.0,1


**Step 5: Creating a dummy variable**

In [42]:
df = pd.get_dummies(df1,columns = ["Country"]) 
df

Unnamed: 0,Age,Salary,Purchased,Country_France,Country_Germany,Country_Spain
0,44.0,72000.0,0,1,0,0
1,27.0,48000.0,1,0,0,1
2,30.0,54000.0,0,0,1,0
3,38.0,61000.0,0,0,0,1
4,40.0,48000.0,1,0,1,0
5,35.0,58000.0,1,1,0,0
6,27.0,52000.0,0,0,0,1
7,48.0,79000.0,1,1,0,0
8,50.0,83000.0,0,0,1,0
9,37.0,67000.0,1,1,0,0


In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Age              10 non-null     object
 1   Salary           10 non-null     object
 2   Purchased        10 non-null     int32 
 3   Country_France   10 non-null     uint8 
 4   Country_Germany  10 non-null     uint8 
 5   Country_Spain    10 non-null     uint8 
dtypes: int32(1), object(2), uint8(3)
memory usage: 358.0+ bytes


In [44]:
df.describe()

Unnamed: 0,Purchased,Country_France,Country_Germany,Country_Spain
count,10.0,10.0,10.0,10.0
mean,0.5,0.4,0.3,0.3
std,0.527046,0.516398,0.483046,0.483046
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.5,0.0,0.0,0.0
75%,1.0,1.0,0.75,0.75
max,1.0,1.0,1.0,1.0


In [45]:
df.corr()

Unnamed: 0,Purchased,Country_France,Country_Germany,Country_Spain
Purchased,1.0,0.408248,-0.218218,-0.218218
Country_France,0.408248,1.0,-0.534522,-0.534522
Country_Germany,-0.218218,-0.534522,1.0,-0.428571
Country_Spain,-0.218218,-0.534522,-0.428571,1.0


**Step 6: Splitting the datasets into training sets and Test sets**

In [52]:
X = df[['Age','Salary','Country_France','Country_Germany','Country_Spain']].values
y = df[['Purchased']].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [53]:
X_train

array([[44.0, 72000.0, 1, 0, 0],
       [38.0, 61000.0, 0, 0, 1],
       [40.0, 48000.0, 0, 1, 0],
       [37.0, 67000.0, 1, 0, 0],
       [48.0, 79000.0, 1, 0, 0],
       [27.0, 48000.0, 0, 0, 1],
       [30.0, 54000.0, 0, 1, 0]], dtype=object)

In [55]:
X_test

array([[50.0, 83000.0, 0, 1, 0],
       [35.0, 58000.0, 1, 0, 0],
       [27.0, 52000.0, 0, 0, 1]], dtype=object)

**Step 7: Feature Scaling**

In [49]:
scaler = StandardScaler()
scaler.fit(X_train,y_train) 
X_train_scaled = scaler.transform(X_train) 
X_test_scaled = scaler.transform(X_test) 

In [56]:
X_train_scaled

array([[ 1.67313909,  1.87677705, -0.63245553,  1.15470054, -0.63245553],
       [ 0.03891021,  0.03586198, -0.63245553, -0.8660254 ,  1.58113883],
       [ 0.85602465,  0.95631951,  1.58113883, -0.8660254 , -0.63245553],
       [-1.45913292, -1.05195147, -0.63245553, -0.8660254 ,  1.58113883],
       [ 0.31128169, -1.05195147, -0.63245553,  1.15470054, -0.63245553],
       [-1.0505757 , -0.54988372, -0.63245553,  1.15470054, -0.63245553],
       [-0.36964701, -0.21517189,  1.58113883, -0.8660254 , -0.63245553]])

In [57]:
X_test_scaled

array([[-0.09727553,  0.53792973,  1.58113883, -0.8660254 , -0.63245553],
       [-1.45913292, -0.71723964, -0.63245553, -0.8660254 ,  1.58113883],
       [ 1.40076761,  1.54206522,  1.58113883, -0.8660254 , -0.63245553]])