## Apply the following steps to the dataset given in a URL

### Displaying dataset

In [28]:
import pandas as pd
# Load the CSV file (path is relative to the current working directory)
# into a pandas DataFrame, then display it
data = pd.read_csv(filepath_or_buffer='data_preprocessing.csv')
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


### Handling missing data

In [29]:
# Count the missing (NaN) entries in each column; a non-zero count means
# that column needs imputation
data.isna().sum(axis=0)

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [30]:
# Fill the missing Age values with the average age (the mean is computed
# over the non-missing entries).
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on data['Age']: that form is a chained in-place
# operation on a column selection, which raises a FutureWarning on
# pandas 2.x and does not update `data` once Copy-on-Write is enabled.
data['Age'] = data['Age'].fillna(data['Age'].mean())
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [31]:
# Fill the missing Salary values with the average salary (the mean is
# computed over the non-missing entries).
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on data['Salary']: that form is a chained
# in-place operation on a column selection, which raises a FutureWarning on
# pandas 2.x and does not update `data` once Copy-on-Write is enabled.
data['Salary'] = data['Salary'].fillna(data['Salary'].mean())
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


### Handling categorical data

In [32]:
# The Country and Purchased columns hold text labels rather than numbers,
# so they have to be encoded. Start by collecting the distinct values of
# Country (returned in order of first appearance).
countries = data.loc[:, 'Country'].unique()
countries

array(['France', 'Spain', 'Germany'], dtype=object)

In [33]:
# Label-encode Country: every distinct country gets an integer code
# 1, 2, 3, ... in order of first appearance (the same order that
# data['Country'].unique() reports).
# pd.factorize returns 0-based codes, hence the + 1. Doing the encoding in
# one vectorised step stores the column as integers (rather than an object
# column holding Python ints) and gives the same codes if the cell is
# re-run on the already-encoded column.
# Note: factorize marks missing values with -1, so a missing country would
# end up with code 0; the column has no missing values here.
data['Country'] = pd.factorize(data['Country'])[0] + 1
data

Unnamed: 0,Country,Age,Salary,Purchased
0,1,44.0,72000.0,No
1,2,27.0,48000.0,Yes
2,3,30.0,54000.0,No
3,2,38.0,61000.0,No
4,3,40.0,63777.777778,Yes
5,1,35.0,58000.0,Yes
6,2,38.777778,52000.0,No
7,1,48.0,79000.0,Yes
8,3,50.0,83000.0,No
9,1,37.0,67000.0,Yes


In [34]:
# Collect the distinct labels that occur in the Purchased column
purchased = data.loc[:, 'Purchased'].unique()
purchased

array(['No', 'Yes'], dtype=object)

In [35]:
# Encode Purchased as integers: 'No' -> 0, 'Yes' -> 1.
# An explicit mapping is used so that only the known labels are encoded.
# Treating "everything that is not 'No'" as 1 would silently turn typos,
# different spellings, or already-encoded 0s (when the cell is re-run)
# into 1.
# The identity entries 0 -> 0 and 1 -> 1 make re-running this cell a no-op.
purchase_codes = {'No': 0, 'Yes': 1, 0: 0, 1: 1}
encoded_purchased = data['Purchased'].map(purchase_codes)

# map() yields NaN for anything that is not a key of the mapping (including
# missing labels); fail loudly instead of carrying bad labels forward.
unmapped = data.loc[encoded_purchased.isna(), 'Purchased'].unique()
if len(unmapped) > 0:
    raise ValueError(
        f"Unexpected values in column 'Purchased': {list(unmapped)}"
    )

# Store the labels as a proper integer column
data['Purchased'] = encoded_purchased.astype('int64')
data

Unnamed: 0,Country,Age,Salary,Purchased
0,1,44.0,72000.0,0
1,2,27.0,48000.0,1
2,3,30.0,54000.0,0
3,2,38.0,61000.0,0
4,3,40.0,63777.777778,1
5,1,35.0,58000.0,1
6,2,38.777778,52000.0,0
7,1,48.0,79000.0,1
8,3,50.0,83000.0,0
9,1,37.0,67000.0,1


### Split the dataset into training set and test set

In [36]:
# Split the dataset: 70 % of the rows go to the training set and the
# remaining 30 % to the test set.
# Work on a separate copy of the frame.
data_copy = data.copy(deep=True)
# Draw a random 70 % of the rows (without replacement) as the training set.
# random_state pins the draw; choosing a different random state selects
# different rows.
train_set = data_copy.sample(frac=0.7, replace=False, random_state=0)
train_set

Unnamed: 0,Country,Age,Salary,Purchased
2,3,30.0,54000.0,0
8,3,50.0,83000.0,0
4,3,40.0,63777.777778,1
9,1,37.0,67000.0,1
1,2,27.0,48000.0,1
6,2,38.777778,52000.0,0
7,1,48.0,79000.0,1


In [37]:
# The test set is every row whose index label was not drawn for the
# training set, i.e. the remaining 30 %
test_set = data_copy.drop(index=train_set.index)
test_set

Unnamed: 0,Country,Age,Salary,Purchased
0,1,44.0,72000.0,0
3,2,38.0,61000.0,0
5,1,35.0,58000.0,1


### Feature Scaling

In [38]:
# Min-max normalise the Age column so its values lie between 0 and 1:
#     scaled = (age - min) / (max - min)
# The whole column is rescaled in one vectorised expression. Rescaling one
# value at a time by matching rows with data['Age'] == age, while the same
# column is being overwritten, can rescale a row twice whenever an
# already-scaled value equals a raw value that is processed later.
# A constant column (max == min) would produce NaN here.
# NOTE(review): train_set / test_set were taken from a copy of `data` in the
# split cells, so they are not affected by this scaling.
minimum = data['Age'].min()
maximum = data['Age'].max()
data['Age'] = (data['Age'] - minimum) / (maximum - minimum)
data['Age']

0    0.739130
1    0.000000
2    0.130435
3    0.478261
4    0.565217
5    0.347826
6    0.512077
7    0.913043
8    1.000000
9    0.434783
Name: Age, dtype: float64

In [39]:
# Min-max normalise the Salary column so its values lie between 0 and 1:
#     scaled = (salary - min) / (max - min)
# The whole column is rescaled in one vectorised expression. Rescaling one
# value at a time by matching rows with data['Salary'] == salary, while the
# same column is being overwritten, can rescale a row twice whenever an
# already-scaled value equals a raw value that is processed later.
# A constant column (max == min) would produce NaN here.
# NOTE(review): train_set / test_set were taken from a copy of `data` in the
# split cells, so they are not affected by this scaling.
minimum = data['Salary'].min()
maximum = data['Salary'].max()
data['Salary'] = (data['Salary'] - minimum) / (maximum - minimum)
data['Salary']

0    0.685714
1    0.000000
2    0.171429
3    0.371429
4    0.450794
5    0.285714
6    0.114286
7    0.885714
8    1.000000
9    0.542857
Name: Salary, dtype: float64

In [40]:
# Display the full frame after missing-value imputation, categorical
# encoding and feature scaling
data

Unnamed: 0,Country,Age,Salary,Purchased
0,1,0.73913,0.685714,0
1,2,0.0,0.0,1
2,3,0.130435,0.171429,0
3,2,0.478261,0.371429,0
4,3,0.565217,0.450794,1
5,1,0.347826,0.285714,1
6,2,0.512077,0.114286,0
7,1,0.913043,0.885714,1
8,3,1.0,1.0,0
9,1,0.434783,0.542857,1
