In [1]:
## Data preprocessing
# Step 1 : Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Step 2 : Load dataset
dataset = pd.read_csv('Data/data_pre1.csv')
dataset

Unnamed: 0,Country,Age,Salary,Purchased,L1
0,France,44.0,72000.0,No,
1,Spain,27.0,48000.0,Yes,
2,Germany,30.0,54000.0,No,2.0
3,Spain,38.0,61000.0,No,
4,Germany,40.0,,Yes,
5,France,35.0,58000.0,Yes,
6,Spain,,52000.0,No,
7,France,48.0,79000.0,Yes,4.0
8,Germany,50.0,83000.0,No,
9,France,37.0,67000.0,,


In [3]:
# Step 3 : Missing data management
# Check for missing values
dataset.isna().sum()

Country      0
Age          1
Salary       1
Purchased    1
L1           8
dtype: int64

In [4]:
# Check percentage
NAN = [(clm_name, dataset[clm_name].isna().mean() * 100) for clm_name in dataset]
NAN = pd.DataFrame(NAN, columns=["column_name", "percentage"])
NAN

Unnamed: 0,column_name,percentage
0,Country,0.0
1,Age,10.0
2,Salary,10.0
3,Purchased,10.0
4,L1,80.0


In [5]:
# Check columns which are crossing threshold
NAN[NAN['percentage'] > 50]

Unnamed: 0,column_name,percentage
4,L1,80.0


In [6]:
# Using drop methods from original dataset
dataset.drop(columns = ['L1'], axis = 1, inplace = True)
dataset.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [7]:
# For rest of missing values we can use below method
# Method 1 : replace() with mean, median, mode
dataset.Age = dataset.Age.replace(np.nan, dataset.Age.mean())
# Method 2 : fillna()
dataset.Salary = dataset.Salary.fillna(dataset.Salary.mean())
dataset.Purchased = dataset.Purchased.fillna(dataset.Purchased.mode()[0])
# Method 3 : SimpleImputer
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan,strategy="mean")
dataset.loc[:, ['Age', 'Salary']] = imputer.fit_transform(dataset.loc[:, ['Age', 'Salary']])
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,No


In [8]:
# Step 4 : Check data type
# Check data type each column
dataset.dtypes

Country       object
Age          float64
Salary       float64
Purchased     object
dtype: object

In [9]:
# If needed change the data type
dataset.Age = dataset.Age.astype('int32')
dataset.dtypes

Country       object
Age            int32
Salary       float64
Purchased     object
dtype: object

In [10]:
# Step 5 : Calculate measure of central dependency
# Mean
print("The average value of each columns are below.")
print("Mean\n{}\n".format(dataset.mean()))

# Median
print("The middle value of all the columns are below.")
print("Median\n{}\n".format(dataset.median()))

# Mode
print("The most common value of each column are below.")
print("Mode\n{}\n".format(dataset.mode().iloc[0]))

The average value of each columns are below.
Mean
Age          38.700000
Salary    63777.777778
dtype: float64

The middle value of all the columns are below.
Median
Age          38.000000
Salary    62388.888889
dtype: float64

The most common value of each column are below.
Mode
Country      France
Age              38
Salary        48000
Purchased        No
Name: 0, dtype: object



In [14]:
# Measure of dispersion
# Varience
print("Variance\n{}\n".format(dataset.var()))
print("""
The variance is small
- It means all column datapoints are tend to close together and close to mean.
If variance is big
- It means this column datapoints are spread-out with respect to each other and with respect to mean.
""")

# Standard deviation
print("Standard deviation\n{}\n".format(dataset.std()))
print("""
Standard deviation is small.
- It means data points are tightky clustered around mean.
Standard deviation is big.
- It means data points widely spread as compare to other columns.
""")

Variance
Age       5.267778e+01
Salary    1.337284e+08
dtype: float64


The variance is small
- It means all column datapoints are tend to close together and close to mean.
If variance is big
- It means this column datapoints are spread-out with respect to each other and with respect to mean.

Standard deviation
Age           7.257946
Salary    11564.099406
dtype: float64


Standard deviation is small.
- It means data points are tightky clustered around mean.
Standard deviation is big.
- It means data points widely spread as compare to other columns.



In [32]:
# Calculate moments
from scipy.stats import kurtosis
from scipy.stats import skew
# Skewness
print("Skewness\n{}\n".format(dataset.skew()))
skews = dataset.skew()
sk_list = list()

for i in skews:
    if(i == 0):
        sk_list.append("Normally distributed")
    elif(i < 0):
        sk_list.append("Negatively distributed")
    elif(i>0):
        sk_list.append("Positively distributed")
skewness_result = pd.Series(sk_list)
skewness_result.index = dataset.mean().index
print("The details informaton about skewness below.")
print(skewness_result)

Skewness
Age       0.029773
Salary    0.404912
dtype: float64

The details informaton about skewness below.
Age       Positively distributed
Salary    Positively distributed
dtype: object


In [34]:
# Kurtosis
print("Kurtosis\n{}\n".format(dataset.kurtosis()))
kur = dataset.kurtosis()
sk_list = list()
for i in kur:
    if(i == 0):
        sk_list.append("Mesokurtic")
    elif(i < 0):
        sk_list.append("Leptokurtic")
    elif(i>0):
        sk_list.append("Platykurtic")
kurtosis_result = pd.Series(sk_list)
kurtosis_result.index = dataset.mean().index
print("The details informaton about kurtosis below.")
print(kurtosis_result)

Kurtosis
Age      -0.466385
Salary   -0.843679
dtype: float64

The details informaton about kurtosis below.
Age       Leptokurtic
Salary    Leptokurtic
dtype: object


In [None]:
# Problem statement and solution

In [None]:
# Visualization

In [35]:
! jupyter nbconvert --to script "data_analysis.ipynb"

[NbConvertApp] Converting notebook data_analysis.ipynb to script
[NbConvertApp] Writing 3711 bytes to data_analysis.py
