# PART- 3 `( Handling Outliers )`

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from scipy import stats
from scipy.stats.mstats import winsorize

import warnings
warnings.filterwarnings('ignore')
warnings.warn("this will not show")

%matplotlib inline
# %matplotlib notebook

plt.rcParams["figure.figsize"] = (10,6)
# plt.rcParams['figure.dpi'] = 100

sns.set_style("whitegrid")
pd.set_option('display.float_format', lambda x: '%.3f' % x)

pd.options.display.max_rows = 1000
pd.options.display.max_columns = 150

In [None]:
# Load the input frame from a CSV in the working directory.
# NOTE(review): presumably the output of the earlier missing-value step — confirm.
df = pd.read_csv("filled_scout.csv")

## functions to fill the missing values

In [None]:
def fill_most(df, group_col, col_name):
    '''Fill missing values of ``col_name`` in place with the mode of each ``group_col`` group.

    For every distinct value of ``group_col`` the NaNs of ``col_name`` inside that
    group are replaced by the group's most frequent value. When the group has no
    observed value, the most frequent value of the whole column is used instead.
    If the whole column has no observed value either, the group is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    group_col : str
        Column whose values define the groups.
    col_name : str
        Column whose missing values are filled.

    Returns
    -------
    None
        Prints the remaining NaN count and the value counts of ``col_name``.
    '''
    for group in list(df[group_col].unique()):
        cond = df[group_col]==group
        modes = df.loc[cond, col_name].mode()
        if modes.empty:
            # No observed value in this group: fall back to the column-wide mode.
            modes = df[col_name].mode()
        if modes.empty:
            # Column has no observed value at all; nothing to fill with.
            continue
        df.loc[cond, col_name] = df.loc[cond, col_name].fillna(modes.iloc[0])
    print("Number of NaN : ",df[col_name].isnull().sum())
    print("------------------")
    print(df[col_name].value_counts(dropna=False))

In [None]:
def fill_prop(df, group_col, col_name):
    '''Fill missing values of ``col_name`` in place by propagating neighbouring values.

    Within each ``group_col`` group, NaNs are forward-filled and then back-filled
    from the other rows of the same group. Anything still missing afterwards
    (groups with no observed value) is forward/back-filled across the whole column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    group_col : str
        Column whose values define the groups.
    col_name : str
        Column whose missing values are filled.

    Returns
    -------
    None
        Prints the remaining NaN count and the value counts of ``col_name``.
    '''
    for group in list(df[group_col].unique()):
        cond = df[group_col]==group
        # .ffill()/.bfill() replace the deprecated fillna(method=...) spelling.
        df.loc[cond, col_name] = df.loc[cond, col_name].ffill().bfill()
    df[col_name] = df[col_name].ffill().bfill()
    print("Number of NaN : ",df[col_name].isnull().sum())
    print("------------------")
    print(df[col_name].value_counts(dropna=False))

In [None]:
def fill(df, group_col1, group_col2, col_name, method): # method can be "mode" or "median" or "ffill"
    '''Fill missing values of ``col_name`` in place using two-stage grouping.

    Rows are grouped by ``group_col1`` and, inside each group, by ``group_col2``.

    * ``"mode"``   - use the mode of the (group1, group2) subgroup; if it has no
      observed value use the mode of the group1 group; if that is empty too use
      the mode of the whole column. If the column has no observed value the
      subgroup is left as is.
    * ``"median"`` - same fallback chain, using medians.
    * ``"ffill"``  - forward-fill then back-fill inside each subgroup, then inside
      each group1 group, then over the whole column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    group_col1 : str
        Outer grouping column.
    group_col2 : str
        Inner grouping column.
    col_name : str
        Column whose missing values are filled.
    method : str
        One of ``"mode"``, ``"median"`` or ``"ffill"``.

    Returns
    -------
    None
        Prints the remaining NaN count and the value counts of ``col_name``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    '''
    if method not in ("mode", "median", "ffill"):
        raise ValueError('method must be "mode", "median" or "ffill", got %r' % (method,))

    groups1 = list(df[group_col1].unique())
    groups2 = list(df[group_col2].unique())

    if method == "mode":
        for group1 in groups1:
            cond1 = df[group_col1]==group1
            for group2 in groups2:
                cond2 = cond1 & (df[group_col2]==group2)
                if not df.loc[cond2, col_name].isnull().any():
                    continue  # nothing to fill in this subgroup
                # Take the first non-empty mode: subgroup, then group1, then whole column.
                for candidates in (df.loc[cond2, col_name], df.loc[cond1, col_name], df[col_name]):
                    modes = candidates.mode()
                    if not modes.empty:
                        df.loc[cond2, col_name] = df.loc[cond2, col_name].fillna(modes.iloc[0])
                        break

    elif method == "median":
        for group1 in groups1:
            cond1 = df[group_col1]==group1
            for group2 in groups2:
                cond2 = cond1 & (df[group_col2]==group2)
                subgroup = df.loc[cond2, col_name]
                if not subgroup.isnull().any():
                    continue  # nothing to fill in this subgroup
                # A NaN median (no observed value at that level) leaves the NaNs for the next level.
                subgroup = subgroup.fillna(subgroup.median())
                subgroup = subgroup.fillna(df.loc[cond1, col_name].median())
                subgroup = subgroup.fillna(df[col_name].median())
                df.loc[cond2, col_name] = subgroup

    else:  # method == "ffill"
        # .ffill()/.bfill() replace the deprecated fillna(method=...) spelling.
        for group1 in groups1:
            cond1 = df[group_col1]==group1
            for group2 in groups2:
                cond2 = cond1 & (df[group_col2]==group2)
                df.loc[cond2, col_name] = df.loc[cond2, col_name].ffill().bfill()

        for group1 in groups1:
            cond1 = df[group_col1]==group1
            df.loc[cond1, col_name] = df.loc[cond1, col_name].ffill().bfill()

        df[col_name] = df[col_name].ffill().bfill()

    print("Number of NaN : ",df[col_name].isnull().sum())
    print("------------------")
    print(df[col_name].value_counts(dropna=False))

In [None]:
# (rows, columns) of the frame as loaded.
df.shape

In [None]:
# Show the first row transposed so every column name and a sample value is visible.
df.head(1).T

## km

In [None]:
# km in four side-by-side panels: histogram, default box plot,
# box plot with whis=2.5, and box plot of log(km) with whis=2.5.
fig, (ax_hist, ax_box, ax_box_wide, ax_box_log) = plt.subplots(1, 4, figsize=(15, 10))

ax_hist.hist(df["km"], bins=50)
ax_box.boxplot(df["km"])
ax_box_wide.boxplot(df["km"], whis=2.5)

# NOTE(review): np.log returns -inf for a value of 0 — confirm km has no zero entries.
ax_box_log.boxplot(np.log(df["km"]), whis=2.5)

plt.show()

In [None]:
# Disabled: would add a log-transformed copy of km as a new column.
#df["km_logged"] = np.log(df.km)

km has a wide range of values, so the points flagged above cannot be treated as real outliers. Let's keep the column as it is.

## cons_comb

In [None]:
# Summary statistics for cons_comb.
df.cons_comb.describe()

In [None]:
# Ten smallest cons_comb values.
df.cons_comb.sort_values().head(10)

In [None]:
# Last ten entries of the ascending sort (the largest values; any NaN would sort last).
df.cons_comb.sort_values().tail(10)

Using domain knowledge, we can conclude that the minimum value of 3 and the maximum value of 9.10 are plausible. Still, let's examine potential outliers using plots, a log transform (`np.log`), and winsorization.

In [None]:
# cons_comb: histogram (left) and box plot with whis=2.5 (right).
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(10, 6))
ax_hist.hist(df["cons_comb"], bins=25)
ax_box.boxplot(df["cons_comb"], whis=2.5)
plt.show()

In [None]:
# log(cons_comb): histogram (left) and box plot with whis=2 (right).
cons_comb_log = np.log(df["cons_comb"])

fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(10, 6))
ax_hist.hist(cons_comb_log, bins=25)
ax_box.boxplot(cons_comb_log, whis=2)
plt.show()

In [None]:
# Winsorized cons_comb: histogram (left) and box plot with whis=2 (right).
# The limit is passed as a single float (0.003), as in the original call;
# presumably scipy applies it to both tails — confirm against the winsorize docs.
cons_comb_winsorized = winsorize(df["cons_comb"], 0.003)

fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(10, 6))
ax_hist.hist(cons_comb_winsorized, bins=22)
ax_box.boxplot(cons_comb_winsorized, whis=2)
plt.show()

## Displacement_cc

In [None]:
# Summary statistics for Displacement_cc.
df.Displacement_cc.describe()

In [None]:
# First 55 Displacement_cc values when sorted in descending order (candidate high outliers).
df.sort_values(by = ["Displacement_cc"], ascending = False)["Displacement_cc"].head(55)

In [None]:
# First 55 Displacement_cc values when sorted in ascending order (candidate low outliers).
df.sort_values(by = ["Displacement_cc"], ascending = True)["Displacement_cc"].head(55)

In [None]:
# Boolean mask of rows whose Displacement_cc equals one of the hand-picked values.
# NOTE(review): presumably chosen from the sorted listings as implausible — confirm.
flagged_displacements = [1, 2, 54, 122, 139, 140, 160, 16000, 15898]
outlier_bool_list = df["Displacement_cc"].isin(flagged_displacements)

In [None]:
# Display the boolean mask (one True/False entry per row of df).
outlier_bool_list

In [None]:
# Replace the flagged Displacement_cc values with NaN so they can be imputed.
df.loc[outlier_bool_list, "Displacement_cc"] = np.nan

In [None]:
# Impute missing Displacement_cc with the mode per make_model/body_type subgroup
# (fill() falls back to the make_model mode, then the whole-column mode).
fill(df, "make_model", "body_type", "Displacement_cc", "mode")

In [None]:
# Displacement_cc: histogram (left) and box plot with whis=2.5 (right).
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(10, 6))
ax_hist.hist(df["Displacement_cc"], bins=20)
ax_box.boxplot(df["Displacement_cc"], whis=2.5)
plt.show()

- There is no problem in terms of outliers

## Nr_of_Doors

In [None]:
# Frequency of each Nr_of_Doors value, NaN included.
df.Nr_of_Doors.value_counts(dropna = False)

In [None]:
# Turn the door counts 1 and 7 into NaN so they can be imputed.
# The result is assigned back to the column: replace(..., inplace=True) on
# df.Nr_of_Doors acts on a Series obtained via attribute access, which is not
# guaranteed to write through to df (a no-op under pandas Copy-on-Write).
df["Nr_of_Doors"] = df["Nr_of_Doors"].replace([1, 7], np.nan)

In [None]:
# Re-check the Nr_of_Doors frequencies, NaN included, after the replacement.
df.Nr_of_Doors.value_counts(dropna = False)

In [None]:
# Impute missing Nr_of_Doors with the mode per make_model/body_type subgroup
# (fill() falls back to the make_model mode, then the whole-column mode).
fill(df, "make_model", "body_type", "Nr_of_Doors", "mode")

In [None]:
# Bar chart of the Nr_of_Doors frequencies (NaN included if any remain).
df.Nr_of_Doors.value_counts(dropna = False).plot.bar()

In [None]:
# Bar chart of row counts for each (make_model, Nr_of_Doors) combination.
df.groupby(["make_model", "Nr_of_Doors"]).Nr_of_Doors.count().plot.bar()

In [None]:
# Remove the Nr_of_Doors column from df (in place; re-running this cell raises KeyError).
# NOTE(review): the column was cleaned and imputed just before being dropped — confirm this is intended.
df.drop("Nr_of_Doors", axis=1, inplace=True)

## Nr_of_Seats

In [None]:
# Frequency of each Nr_of_Seats value, NaN included.
df.Nr_of_Seats.value_counts(dropna = False)

In [None]:
# Turn the seat counts 3 and 6 into NaN so they can be imputed.
# The result is assigned back to the column: replace(..., inplace=True) on
# df.Nr_of_Seats acts on a Series obtained via attribute access, which is not
# guaranteed to write through to df (a no-op under pandas Copy-on-Write).
df["Nr_of_Seats"] = df["Nr_of_Seats"].replace([3, 6], np.nan)

In [None]:
# Re-check the Nr_of_Seats frequencies, NaN included, after the replacement.
df.Nr_of_Seats.value_counts(dropna = False)

In [None]:
# Impute missing Nr_of_Seats with the mode per make_model/body_type subgroup
# (fill() falls back to the make_model mode, then the whole-column mode).
fill(df, "make_model", "body_type", "Nr_of_Seats", "mode")

In [None]:
# Bar chart of the Nr_of_Seats frequencies (NaN included if any remain).
df.Nr_of_Seats.value_counts(dropna = False).plot.bar()

In [None]:
# Bar chart of row counts for each (make_model, Nr_of_Seats) combination.
df.groupby(["make_model", "Nr_of_Seats"]).Nr_of_Seats.count().plot.bar()

In [None]:
# Remove the Nr_of_Seats column from df (in place; re-running this cell raises KeyError).
# NOTE(review): the column was cleaned and imputed just before being dropped — confirm this is intended.
df.drop("Nr_of_Seats", axis=1, inplace=True)

## Weight_kg

In [None]:
# Summary statistics for Weight_kg.
df.Weight_kg.describe()

In [None]:
# Ten smallest Weight_kg values.
df["Weight_kg"].sort_values().head(10)

In [None]:
# Turn the weights 1 and 102 into NaN so they can be imputed.
# The result is assigned back to the column: replace(..., inplace=True) on
# df["Weight_kg"] acts on an intermediate Series, which is not guaranteed to
# write through to df (a no-op under pandas Copy-on-Write).
df["Weight_kg"] = df["Weight_kg"].replace([1, 102], np.nan)

In [None]:
# Number of missing Weight_kg values before imputation.
df["Weight_kg"].isnull().sum()

In [None]:
# Impute missing Weight_kg with the mode per make_model/body_type subgroup
# (fill() falls back to the make_model mode, then the whole-column mode).
fill(df, "make_model", "body_type", "Weight_kg", "mode")

In [None]:
# Weight_kg: histogram (left) and box plot with whis=2.5 (right).
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(10, 6))
ax_hist.hist(df["Weight_kg"], bins=15)
ax_box.boxplot(df["Weight_kg"], whis=2.5)
plt.show()

In [None]:
# log(Weight_kg): histogram (left) and box plot with whis=2.5 (right).
weight_log = np.log(df["Weight_kg"])

fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(10, 6))
ax_hist.hist(weight_log, bins=15)
ax_box.boxplot(weight_log, whis=2.5)
plt.show()

## hp_kW

In [None]:
# Summary statistics for hp_kW.
df.hp_kW.describe()

In [None]:
# Fifty smallest hp_kW values.
df.hp_kW.sort_values().head(50)

In [None]:
# Distinct hp_kW values, in order of first appearance.
df.hp_kW.unique()

In [None]:
# Turn hp_kW values below 40 into NaN so they can be imputed.
# A single df.loc[mask, column] assignment is used: the chained form
# df.hp_kW.loc[...] = ... sets values on an intermediate Series and is not
# guaranteed to write through to df (a no-op under pandas Copy-on-Write).
df.loc[df["hp_kW"] < 40, "hp_kW"] = np.nan

In [None]:
# Number of missing hp_kW values before imputation.
df.hp_kW.isnull().sum()

In [None]:
# Impute missing hp_kW with the mode per make_model/body_type subgroup
# (fill() falls back to the make_model mode, then the whole-column mode).
fill(df, "make_model", "body_type", "hp_kW", "mode")

In [None]:
# hp_kW: histogram (left) and box plot with whis=2.5 (right).
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(10, 6))
ax_hist.hist(df["hp_kW"], bins=15)
ax_box.boxplot(df["hp_kW"], whis=2.5)
plt.show()

In [None]:
# log(hp_kW): histogram (left) and box plot with whis=2.5 (right).
hp_log = np.log(df["hp_kW"])

fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(10, 6))
ax_hist.hist(hp_log, bins=15)
ax_box.boxplot(hp_log, whis=2.5)
plt.show()

## CO2_Emission

In [None]:
# Summary statistics for CO2_Emission.
df.CO2_Emission.describe()

In [None]:
# Frequency of each CO2_Emission value, ordered by the value itself.
df.CO2_Emission.value_counts().sort_index()

In [None]:
# Price summary statistics for every (make_model, body_type, CO2_Emission) combination.
df.groupby(["make_model", "body_type","CO2_Emission"]).price.describe()

In [None]:
# Flag CO2_Emission values below 52 or above 330 and replace them with NaN for imputation.
cond = ((df.CO2_Emission < 52) | (df.CO2_Emission > 330))
df.loc[cond, "CO2_Emission"] = np.nan

In [None]:
# Number of missing CO2_Emission values before imputation.
df.CO2_Emission.isnull().sum()

In [None]:
# Impute missing CO2_Emission with the median per make_model/body_type subgroup
# (fill() falls back to the make_model median, then the whole-column median).
fill(df, "make_model", "body_type", "CO2_Emission", "median")

In [None]:
# CO2_Emission: histogram (left) and box plot with whis=2.5 (right).
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(10, 6))
ax_hist.hist(df["CO2_Emission"], bins=15)
ax_box.boxplot(df["CO2_Emission"], whis=2.5)
plt.show()

In [None]:
# log(CO2_Emission): histogram (left) and box plot with whis=2.5 (right).
co2_log = np.log(df["CO2_Emission"])

fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(10, 6))
ax_hist.hist(co2_log, bins=15)
ax_box.boxplot(co2_log, whis=2.5)
plt.show()

## price

In [None]:
# Summary statistics for price.
df.price.describe()

In [None]:
# Five smallest price values (with their index labels).
df.price.sort_values().head(5)

In [None]:
# Drop four rows by hard-coded index label (in place).
# NOTE(review): presumably the implausibly low prices from the listing above; the labels
# are only valid for this exact input file and row order — confirm before reuse.
df.drop(index = [8594, 8828, 6066,8829], axis = 0, inplace = True)

In [None]:
# Renumber the index from 0 after the row drop, discarding the old labels.
df.reset_index(drop = True, inplace = True)

In [None]:
# (rows, columns) after removing the price rows.
df.shape

In [None]:
# price: histogram (left) and box plot with whis=3 (right).
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(10, 6))
ax_hist.hist(df["price"], bins=20)
ax_box.boxplot(df["price"], whis=3)
plt.show()

In [None]:
# log(price): histogram (left) and box plot with whis=2.3 (right).
price_log = np.log(df["price"])

fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(10, 6))
ax_hist.hist(price_log, bins=20)
ax_box.boxplot(price_log, whis=2.3)
plt.show()

In [None]:
# Box plot of price for each make_model (whis=2.3).
fig, ax = plt.subplots(figsize=(16, 6))
sns.boxplot(x="make_model", y="price", data=df, whis=2.3, ax=ax)
plt.show()

In [None]:
# Box plot of log(price) for each make_model (whis=2.3).
fig, ax = plt.subplots(figsize=(16, 6))
sns.boxplot(x="make_model", y=np.log(df.price), data=df, whis=2.3, ax=ax)
plt.show()

## make_model

In [None]:
# Show the rows whose make_model is "Audi A2".
df[df.make_model=="Audi A2"]

In [None]:
# Drop the row with index label 2614 (in place).
# NOTE(review): presumably the "Audi A2" row displayed above; the label depends on the
# earlier drop/reset_index steps and this exact input file — confirm before reuse.
df.drop(index = [2614], inplace = True)

In [None]:
# Renumber the index from 0 after the row drop, discarding the old labels.
df.reset_index(drop = True, inplace = True)

In [None]:
# (rows, columns) after removing the single row.
df.shape

## Final_Step

In [None]:
# First three rows, transposed so all columns are visible.
df.head(3).T

In [None]:
# Columns used for the correlation heatmap and pair plot.
numeric_col = [
    "price",
    "km",
    "Gears",
    "Previous_Owners",
    "cons_comb",
    "Displacement_cc",
    "age",
    "hp_kW",
    "Weight_kg",
    "CO2_Emission",
]

In [None]:
# Sub-frame restricted to the columns listed in numeric_col.
df_num = df[numeric_col]

In [None]:
# Display the numeric sub-frame (output length is governed by the pandas display options).
df_num

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns.
fig, ax = plt.subplots(figsize=(16, 12))
correlations = df_num.corr()
sns.heatmap(correlations, annot=True, cmap="RdYlGn", linewidths=0.2, annot_kws={"size": 16}, ax=ax)

In [None]:
# Pairwise scatter plots / distributions of the numeric columns.
# seaborn renamed the facet-size parameter from `size` to `height`; the old
# name is deprecated, so use `height`.
sns.pairplot(df_num, height = 2.5)
plt.show()

## Dummy Operation

In [None]:
# Column dtypes and non-null counts before encoding.
df.info()

In [None]:
# (rows, columns) before dummy encoding.
df.shape

In [None]:
# Save the cleaned, not-yet-encoded frame (index column omitted).
df.to_csv("final_scout_not_dummy.csv", index=False)

In [None]:
# Each of these columns is split on "," into indicator columns, which are
# joined onto df with a short per-column prefix.
multi_label_prefixes = {
    "Comfort_Convenience": "cc_",
    "Entertainment_Media": "em_",
    "Extras": "ex_",
    "Safety_Security": "ss_",
}
for source_column, prefix in multi_label_prefixes.items():
    indicator_columns = df[source_column].str.get_dummies(sep = ",").add_prefix(prefix)
    df = df.join(indicator_columns)

In [None]:
# Remove the four comma-separated source columns (in place).
# NOTE(review): presumably already expanded into prefixed indicator columns — confirm.
df.drop(["Comfort_Convenience","Entertainment_Media","Extras","Safety_Security"], axis=1, inplace=True)

In [None]:
# (rows, columns) after dropping the four source columns.
df.shape

In [None]:
# One-hot encode df via pd.get_dummies, dropping the first level of each encoded column.
df_final = pd.get_dummies(df, drop_first=True)

In [None]:
# (rows, columns) of the fully encoded frame.
df_final.shape

In [None]:
# First five rows of the encoded frame, transposed so all columns are visible.
df_final.head().T

In [None]:
# Save the encoded frame (index column omitted).
df_final.to_csv("final_scout_dummy.csv", index=False)