In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
# import pandas.rpy.common as com
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn import linear_model

%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory


# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# load data
# Reads the train and test tables from two separate CSV files into pandas DataFrames.
# Both paths are absolute locations under /kaggle/input, so the cell only runs unchanged
# where that directory layout exists.
train_df = pd.read_csv('/kaggle/input/car-data/LEVEL2_Car_Details_Cleaned_train_data.csv')
test_df = pd.read_csv('/kaggle/input/car-data/LEVEL2_Car_Details_Cleaned_test_data.csv')

In [None]:
# Dropping the index-like columns.
# The column to remove is named differently in the two files:
# 'Unnamed: 0' in the train table and 'index' in the test table.
print('Before...')
print('Train df columns :', train_df.columns.tolist())
print('Test df columns :', test_df.columns.tolist())

train_df.drop(columns=['Unnamed: 0'], inplace=True)
test_df.drop(columns=['index'], inplace=True)

print('\nAfter...')
print('Train df columns :', train_df.columns.tolist())
print('Test df columns :', test_df.columns.tolist())

In [None]:
# Cleaning mileage
unique_units = set()  # unit strings seen while parsing; only used for manual inspection
def mileage_to_float(mileage):
    """Return the numeric part of a '<value> <unit>' mileage string as a float.

    The string is split on single spaces. The second piece (the unit) is recorded
    in the module-level ``unique_units`` set before the first piece is converted.
    """
    pieces = mileage.split(" ")
    unique_units.add(pieces[1])
    return float(pieces[0])

# Convert the mileage strings to floats in both tables (train first, then test).
for frame in (train_df, test_df):
    frame['mileage'] = frame['mileage'].apply(mileage_to_float)

# print('Different units: ', unique_units)
train_df.plot.scatter(x='mileage', y='selling_price', c='DarkBlue')

In [None]:
# Cleaning engine

unique_units = set()  # reset: now collects the unit strings seen in the engine column
def engine_to_int(eng):
    """Return the numeric part of a '<value> <unit>' engine string as an int.

    The string is split on whitespace. The second token (the unit) is recorded in
    the module-level ``unique_units`` set before the first token is converted.
    """
    tokens = eng.split()
    unique_units.add(tokens[1])
    displacement = int(tokens[0])
    return displacement
# Convert the engine strings to integers in both tables (train first, then test).
for frame in (train_df, test_df):
    frame['engine'] = frame['engine'].apply(engine_to_int)

# print('Different units: ', unique_units)
train_df.plot.scatter(x='engine', y='selling_price', c='DarkBlue')

In [None]:
# Cleaning max power
unique_units = set()  # reset: now collects the unit strings seen in the max_power column
def power_to_float(power):
    """Return the numeric part of a '<value> <unit>' power string as a float.

    The string is split on whitespace. The second token (the unit) is recorded in
    the module-level ``unique_units`` set before the first token is converted.
    """
    # Local name chosen so the builtin pow() is not shadowed.
    tokens = power.split()
    unique_units.add(tokens[1])
    return float(tokens[0])
# Convert the max_power strings to floats in both tables (train first, then test).
for frame in (train_df, test_df):
    frame['max_power'] = frame['max_power'].apply(power_to_float)

# print('Different units: ', unique_units)
train_df.plot.scatter(x='max_power', y='selling_price', c='DarkBlue')

In [None]:
# One hot encoding
#
# The category list of every feature is taken from the TRAINING data only and then
# applied to both tables. Encoding each table independently (as plain get_dummies on
# each frame would do) can give train and test different dummy columns, or make
# drop_first discard a different reference category, whenever a category is absent
# from one of the two tables.
categorical_prefixes = {'seller_type': 'seller',
                        'transmission': 'transmission',
                        'fuel': 'fuel'}

for feature, prefix in categorical_prefixes.items():
    # sorted() gives the same category order get_dummies derives from raw values,
    # so the resulting column names and the dropped first category are unchanged
    # for the training table.
    train_categories = sorted(train_df[feature].dropna().unique())
    shared_dtype = pd.CategoricalDtype(categories=train_categories)

    # A test value that never occurs in train becomes NaN under the shared dtype and
    # is therefore encoded as all zeros (i.e. as the reference category).
    train_dummies = pd.get_dummies(train_df[feature].astype(shared_dtype),
                                   prefix=prefix, drop_first=True)
    test_dummies = pd.get_dummies(test_df[feature].astype(shared_dtype),
                                  prefix=prefix, drop_first=True)

    train_df = pd.concat([train_df, train_dummies], axis=1)
    test_df = pd.concat([test_df, test_dummies], axis=1)

# the raw categorical columns are no longer needed once encoded
drop_features = list(categorical_prefixes)
train_df = train_df.drop(columns=drop_features)
test_df = test_df.drop(columns=drop_features)

In [None]:
# Encoding owner

def encode_owner(owner_str):
    """Map an ownership label to its ordinal code (0 through 4).

    The code is the label's position in ``ownership_order``. A label that is not
    listed raises ``KeyError``.
    """
    ownership_order = ('Test Drive Car',
                       'First Owner',
                       'Second Owner',
                       'Third Owner',
                       'Fourth & Above Owner')
    code_by_label = {label: code for code, label in enumerate(ownership_order)}
    return code_by_label[owner_str]

# Replace the ownership labels with their ordinal codes in both tables.
for frame in (train_df, test_df):
    frame['owner'] = frame['owner'].apply(encode_owner)

train_df.plot.scatter(x='owner', y='selling_price', c='DarkBlue')

In [None]:
# Cleaning torque

def clean_torque(torque):
    """Extract the leading torque value from a raw torque string.

    Parameters
    ----------
    torque : str
        Raw torque specification; the torque value is expected at the very start.

    Returns
    -------
    float
        The leading number. It is multiplied by 9.80665 when the string mentions
        'kgm' but not 'nm' (case-insensitive). ``np.nan`` is returned when the string
        does not start with a number, matching how the rpm parsers report a missing
        value.
    """
    # Raw string so that \d is a regex class rather than an invalid string escape.
    leading = re.search(r"^[\d,.]*", torque).group()
    # Commas are treated as thousands separators, as in clean_rpm_min / clean_rpm_max;
    # float() would otherwise reject a value such as '1,250'.
    leading = leading.replace(',', '')
    if not any(ch.isdigit() for ch in leading):
        return np.nan
    torquef = float(leading)

    spec = torque.lower()
    if 'kgm' in spec and 'nm' not in spec:
        # value given in kgm only: scale by 9.80665
        torquef = torquef * 9.80665

    return torquef

def _rpm_from_torque(torque, range_index):
    """Shared parser behind clean_rpm_min and clean_rpm_max.

    Parameters
    ----------
    torque : str
        Raw torque specification.
    range_index : int
        Index into the extracted numbers to use when exactly three numbers are found
        (read as ``torque, rpm_low, rpm_high``): -2 for the lower rpm, -1 for the upper.

    Returns
    -------
    float
        The selected rpm, or ``np.nan`` when no rpm can be identified.
    """
    torque = torque.lower()

    # Strip thousands separators FIRST and only then discard tokens without a digit.
    # Filtering empties before stripping commas let a lone ',' survive as '' and
    # later crash float('') or inflate the token count.
    numbers = [token.replace(',', '') for token in re.findall(r"[\d,.]+", torque)]
    numbers = [token for token in numbers if any(ch.isdigit() for ch in token)]
    if not numbers:
        return np.nan

    # Both units present (e.g. '380Nm(38.7kgm)@ 2500rpm'): the last number is the rpm.
    if 'nm' in torque and 'kgm' in torque:
        return float(numbers[-1])

    # Unit-less 'value(value)...' prefix (e.g. '110(11.2)@ 4800'): last number is the rpm.
    if re.match(r"[\d,.]*\([\d,.]*\)", torque):
        return float(numbers[-1])

    if len(numbers) == 2:
        # torque + a single rpm value, which serves as both min and max
        return float(numbers[-1])
    if len(numbers) == 3:
        # torque + rpm range
        # NOTE(review): a spec written like '4000+/-500rpm' also yields three numbers,
        # so 500 is reported as the upper rpm - confirm whether such rows need
        # dedicated handling.
        return float(numbers[range_index])

    # Only the torque value, or more numbers than any known layout: no rpm.
    # (More than three numbers used to raise UnboundLocalError.)
    return np.nan

def clean_rpm_min(torque):
    """Return the lower rpm found in a raw torque string, or ``np.nan`` if none."""
    return _rpm_from_torque(torque, -2)

def clean_rpm_max(torque):
    """Return the upper rpm found in a raw torque string, or ``np.nan`` if none."""
    return _rpm_from_torque(torque, -1)

# Three numeric features are derived from the raw 'torque' string:
#   feature 1 - torque          -> 'torque_1'
#   feature 2 - min rpm         -> 'torque_rpm_min'
#   feature 3 - max rpm         -> 'torque_rpm_max'
torque_parsers = (('torque_1', clean_torque),
                  ('torque_rpm_min', clean_rpm_min),
                  ('torque_rpm_max', clean_rpm_max))
for feature_name, parser in torque_parsers:
    train_df[feature_name] = train_df['torque'].apply(parser)
    test_df[feature_name] = test_df['torque'].apply(parser)

# drop initial 'torque' feature
train_df.drop(columns=['torque'], inplace=True)
test_df.drop(columns=['torque'], inplace=True)

# one scatter plot per derived feature against the target
train_df.plot.scatter(x='torque_1', y='selling_price', c='DarkBlue')
train_df.plot.scatter(x='torque_rpm_min', y='selling_price', c='DarkBlue')
train_df.plot.scatter(x='torque_rpm_max', y='selling_price', c='DarkBlue')

In [None]:
# Cleaning and Encode brand

def get_brand(model):
    """Return the first whitespace-separated word of ``model``."""
    # Only the first word is needed, so stop splitting after it.
    first_word = model.split(maxsplit=1)[0]
    return first_word

def encode_brand(brnd):
    """Map a brand name to its ordinal code (0 through 30).

    Brands are ordered according to the average selling price of each brand: the
    higher the code, the higher the average selling price. The code is the brand's
    position in ``brands_by_price``. An unlisted brand raises ``KeyError``.
    """
    brands_by_price = ('Opel', 'Daewoo', 'Ambassador', 'Chevrolet', 'Fiat', 'Ashok',
                       'Datsun', 'Tata', 'Maruti', 'Renault', 'Nissan', 'Hyundai',
                       'Volkswagen', 'Ford', 'Honda', 'Mahindra', 'Skoda', 'Mitsubishi',
                       'Force', 'Toyota', 'Kia', 'Isuzu', 'Jeep', 'MG', 'Mercedes-Benz',
                       'Audi', 'Jaguar', 'BMW', 'Land', 'Volvo', 'Lexus')
    code_by_brand = {brand: code for code, brand in enumerate(brands_by_price)}
    return code_by_brand[brnd]

# clean: the brand is the first word of 'name'
for frame in (train_df, test_df):
    frame['brand'] = frame['name'].apply(get_brand)

# encode: replace each brand by its ordinal code
for frame in (train_df, test_df):
    frame['brand'] = frame['brand'].apply(encode_brand)

# drop 'name'
train_df.drop(columns=['name'], inplace=True)
test_df.drop(columns=['name'], inplace=True)

train_df.plot.scatter(x='brand', y='selling_price', c='DarkBlue')

In [None]:
# drop rows with NA values (in place, for both tables)
for frame in (train_df, test_df):
    frame.dropna(axis=0, inplace=True)

In [None]:
# Min-max scaling of the feature columns

def min_max_scaling(df, col_min_max=None):
    """Rescale every column of ``df`` with fixed per-column (min, max) bounds.

    Each column is transformed as ``(value - min) / (max - min)``. The bounds are
    constants rather than being computed from ``df``, so train and test frames are
    scaled identically.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns all have an entry in ``col_min_max``.
    col_min_max : dict, optional
        Mapping ``column name -> (min, max)``. When omitted, the built-in table
        below is used.

    Returns
    -------
    pandas.DataFrame
        A scaled copy; ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If ``df`` has a column with no bounds defined.
    """
    if col_min_max is None:
        # NOTE(review): 'brand' starts at 1 although encode_brand assigns 0 to 'Opel',
        # so that brand would scale to a negative value - confirm the bounds.
        col_min_max = {"year": (1994, 2020), "km_driven": (1, 2360457), "owner": (0, 4),
                       "mileage": (9.0, 42.0), "engine": (624, 3604), "max_power": (32.8, 400.0),
                       "seats": (4, 14), "brand": (1, 30), "torque_1": (47.07192, 1863.2634999999998),
                       "torque_rpm_min": (1000.0, 21800.0), "torque_rpm_max": (500.0, 21800.0),
                       "seller_Individual": (0, 1), "seller_Trustmark Dealer": (0, 1),
                       "transmission_Manual": (0, 1), "fuel_Diesel": (0, 1), "fuel_LPG": (0, 1),
                       "fuel_Petrol": (0, 1)}

    # Fail before doing any work, and name every offending column at once.
    missing = [column for column in df.columns if column not in col_min_max]
    if missing:
        raise KeyError("No min/max bounds defined for column(s): {}".format(missing))

    # copy the dataframe so the caller's frame stays untouched
    df_norm = df.copy()
    # apply min-max scaling
    for column in df_norm.columns:
        # To list a frame's own bounds in the format used above:
        # print("\"{}\": ({}, {})".format(column, df_norm[column].min(), df_norm[column].max()),
        #       end=", ")
        low, high = col_min_max[column]
        df_norm[column] = (df_norm[column] - low) / (high - low)

    return df_norm

# Separate the target ('selling_price') from the features for both tables.
y_train = train_df['selling_price']
x_train = train_df.drop(columns=['selling_price'])

y_test = test_df['selling_price']
x_test = test_df.drop(columns=['selling_price'])

# Keep unscaled ("ns" = not scaled) copies for the comparison run further down.
x_train_ns = x_train.copy()
x_test_ns = x_test.copy()

# Normalization x - min max scaling
x_train = min_max_scaling(x_train)
x_test = min_max_scaling(x_test)



In [None]:
# correlation
# Pairwise correlation matrix of the training table; only the column holding each
# feature's correlation with the target is printed.
corr = train_df.corr()
price_corr = corr['selling_price']
print(price_corr)
# plot the heatmap
# sns.heatmap(corr['selling_price'])

In [None]:
# with scaling
# PCA down to 15 components followed by a two-hidden-layer MLP regressor,
# fitted on the min-max scaled features.
mlp = MLPRegressor(hidden_layer_sizes=(100, 100),
                   max_iter=3000,
                   random_state=1,
                   verbose=False)
pipe = make_pipeline(PCA(n_components=15), mlp)
pipe.fit(x_train, y_train)
print('Train score: ', pipe.score(x_train, y_train))
print('Test score:', pipe.score(x_test, y_test))

In [None]:
# without scaling
# Same PCA + MLP pipeline as the scaled run, fitted on the unscaled feature copies
# (x_train_ns / x_test_ns) for comparison.
mlp = MLPRegressor(hidden_layer_sizes=(100, 100),
                   max_iter=3000,
                   random_state=1,
                   verbose=False)
pipe = make_pipeline(PCA(n_components=15), mlp)
pipe.fit(x_train_ns, y_train)
print('Train score: ', pipe.score(x_train_ns, y_train))
print('Test score:', pipe.score(x_test_ns, y_test))