In [1]:
#Data.csv

**Step 1: Importing the libraries**

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

**Step 2: Importing dataset**

In [3]:
df=pd.read_csv("data.csv")
df.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [4]:
df.shape#shape of the df

(10, 4)

In [5]:
df.info()#datatype of each feature

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


**Step 3: Handling the missing data**

In [6]:
#finding the missing value
df.isnull().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [7]:
#to fill the nan values we take the mean of it
avg_age=df["Age"].mean()
avg_salary=df["Salary"].mean()
df["Age"].fillna(avg_age,inplace=True)
df["Salary"].fillna(avg_salary,inplace=True)

In [8]:
#after fill the value check the nan value
df.isnull().sum()

Country      0
Age          0
Salary       0
Purchased    0
dtype: int64

**Step 4: Encoding categorical data**

In [9]:
df.Country.unique()

array(['France', 'Spain', 'Germany'], dtype=object)

In [10]:
df["Purchased"]=df["Purchased"].map({"No":0,"Yes":1})

In [11]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,0
1,Spain,27.0,48000.0,1
2,Germany,30.0,54000.0,0
3,Spain,38.0,61000.0,0
4,Germany,40.0,63777.777778,1
5,France,35.0,58000.0,1
6,Spain,38.777778,52000.0,0
7,France,48.0,79000.0,1
8,Germany,50.0,83000.0,0
9,France,37.0,67000.0,1


In [12]:
#from sklearn import preprocessing
  
# label_encoder object knows how to understand word labels.
#label_encoder = preprocessing.LabelEncoder()
  
# Encode labels in column 'species'.
#df['Country']= label_encoder.fit_transform(df['Country'])
  
#df['Country'].unique()

In [13]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,0
1,Spain,27.0,48000.0,1
2,Germany,30.0,54000.0,0
3,Spain,38.0,61000.0,0
4,Germany,40.0,63777.777778,1
5,France,35.0,58000.0,1
6,Spain,38.777778,52000.0,0
7,France,48.0,79000.0,1
8,Germany,50.0,83000.0,0
9,France,37.0,67000.0,1


**Step 5: Creating a dummy variable**

In [14]:
df=pd.get_dummies(df,drop_first=True)

In [15]:
df.rename(columns={"Country_Germany":"Germany","Country_Spain":"Spain"},inplace=True)

In [16]:
df

Unnamed: 0,Age,Salary,Purchased,Germany,Spain
0,44.0,72000.0,0,0,0
1,27.0,48000.0,1,0,1
2,30.0,54000.0,0,1,0
3,38.0,61000.0,0,0,1
4,40.0,63777.777778,1,1,0
5,35.0,58000.0,1,0,0
6,38.777778,52000.0,0,0,1
7,48.0,79000.0,1,0,0
8,50.0,83000.0,0,1,0
9,37.0,67000.0,1,0,0


**Step 6: Splitting the datasets into training sets and Test sets**

In [17]:
x=df.drop("Purchased",axis=1)
y=df["Purchased"]

In [18]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3)

In [19]:
#shape
x_train.shape

(7, 4)

In [20]:
y_train

9    1
4    1
2    0
6    0
8    0
5    1
7    1
Name: Purchased, dtype: int64

**Step 7: Feature Scaling**

In [21]:
#scaling we can use standard scalar and minmax scalar
scaler = StandardScaler()
model = scaler.fit(x)
dataframe= model.transform(x)

In [22]:
dataframe

array([[ 7.58874362e-01,  7.49473254e-01, -6.54653671e-01,
        -6.54653671e-01],
       [-1.71150388e+00, -1.43817841e+00, -6.54653671e-01,
         1.52752523e+00],
       [-1.27555478e+00, -8.91265492e-01,  1.52752523e+00,
        -6.54653671e-01],
       [-1.13023841e-01, -2.53200424e-01, -6.54653671e-01,
         1.52752523e+00],
       [ 1.77608893e-01,  6.63219199e-16,  1.52752523e+00,
        -6.54653671e-01],
       [-5.48972942e-01, -5.26656882e-01, -6.54653671e-01,
        -6.54653671e-01],
       [ 0.00000000e+00, -1.07356980e+00, -6.54653671e-01,
         1.52752523e+00],
       [ 1.34013983e+00,  1.38753832e+00, -6.54653671e-01,
        -6.54653671e-01],
       [ 1.63077256e+00,  1.75214693e+00,  1.52752523e+00,
        -6.54653671e-01],
       [-2.58340208e-01,  2.93712492e-01, -6.54653671e-01,
        -6.54653671e-01]])