In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the raw test set; the path is relative to the notebook's working directory.
df = pd.read_csv("./test_data.csv")

In [4]:
# Preview the first rows of the raw data (Mileage/Engine/Power still carry unit suffixes).
df.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price
0,Mahindra XUV500 W6 2WD,Kochi,2017,41290,Diesel,Manual,Second,16.0 kmpl,2179 CC,140 bhp,7.0,
1,BMW 5 Series 520d Luxury Line,Delhi,2013,65000,Diesel,Automatic,First,22.48 kmpl,1995 CC,190 bhp,5.0,67.87 Lakh
2,Toyota Corolla Altis 1.8 G,Bangalore,2016,16471,Petrol,Manual,Second,14.28 kmpl,1798 CC,138.03 bhp,5.0,20.48 Lakh
3,Hyundai i20 Asta 1.4 CRDi,Pune,2015,54339,Diesel,Manual,Second,22.54 kmpl,1396 CC,88.73 bhp,5.0,
4,Mahindra KUV 100 mFALCON G80 K2,Hyderabad,2016,24025,Petrol,Manual,First,18.15 kmpl,1198 CC,82 bhp,6.0,


In [5]:
# Inspect column dtypes to see which columns need parsing before modelling.
df.dtypes

Name                  object
Location              object
Year                   int64
Kilometers_Driven      int64
Fuel_Type             object
Transmission          object
Owner_Type            object
Mileage               object
Engine                object
Power                 object
Seats                float64
New_Price             object
dtype: object

In [6]:
# Count missing values per column to decide which rows/columns to drop.
df.isna().sum()

Name                    0
Location                0
Year                    0
Kilometers_Driven       0
Fuel_Type               0
Transmission            0
Owner_Type              0
Mileage                 2
Engine                 14
Power                  14
Seats                  17
New_Price            2177
dtype: int64

In [7]:
# Discard rows that lack any of the numeric specs needed as model features.
required_columns = ["Mileage", "Engine", "Power", "Seats"]
df = df.dropna(subset=required_columns, how="any")

In [8]:
# List the column names currently present in the frame.
df.columns

Index(['Name', 'Location', 'Year', 'Kilometers_Driven', 'Fuel_Type',
       'Transmission', 'Owner_Type', 'Mileage', 'Engine', 'Power', 'Seats',
       'New_Price'],
      dtype='object')

In [9]:
# New_Price is not used as a feature (it is missing for most rows), so remove it.
df = df.drop(columns=["New_Price"])

In [10]:
# Check how many rows and columns remain after the drops above.
df.shape

(2501, 11)

In [11]:
# Look at the raw Mileage strings, which include a unit suffix.
df["Mileage"]

0        16.0 kmpl
1       22.48 kmpl
2       14.28 kmpl
3       22.54 kmpl
4       18.15 kmpl
           ...    
2515     28.4 kmpl
2516     24.4 kmpl
2517     14.0 kmpl
2518     18.9 kmpl
2519    25.44 kmpl
Name: Mileage, Length: 2501, dtype: object

In [12]:
# Tally the Mileage entries by unit. An entry is counted as km/kg first;
# only entries without "km/kg" are considered for the kmpl count.
mileage_texts = [str(entry) for entry in df["Mileage"]]
count_kmkg = sum(1 for text in mileage_texts if "km/kg" in text)
count_kmpl = sum(
    1 for text in mileage_texts if "km/kg" not in text and "kmpl" in text
)
print('The no of rows with km/kg: ', count_kmkg)
print('The no of rows with kmpl: ', count_kmpl)

The no of rows with km/kg:  24
The no of rows with kmpl:  2477


In [13]:
# Divisors applied to kmpl readings, keyed by fuel type.
# NOTE(review): presumably fuel densities in kg per litre, used to express
# kmpl readings as km/kg -- confirm this matches the training preprocessing.
fuel_density = {'CNG': 0.128, 'Diesel': 0.832, 'Petrol': 0.745, 'LPG': 0.579}


def _parse_mileage(raw, fuel):
    """Turn one raw Mileage entry into a float.

    "... kmpl" entries have the 5-character suffix removed and are divided by
    the fuel's entry in ``fuel_density`` (left unchanged when the fuel has no
    entry). Entries ending in "km/kg" have the 6-character suffix removed.
    Anything else is passed straight to ``float``.
    """
    text = str(raw)
    if "kmpl" in text:
        value = float(raw[:-5])
        if fuel in fuel_density:
            return float(value / fuel_density[fuel])
        return value
    if text.endswith('km/kg'):
        return float(raw[:-6])
    return float(raw)


Transformed_Mileage = [
    _parse_mileage(raw, fuel)
    for raw, fuel in zip(df['Mileage'], df['Fuel_Type'])
]

In [14]:
# Replace the raw Mileage strings with the parsed numeric values (same row order).
df["Mileage"] = Transformed_Mileage

In [15]:
# Strip the " CC" unit suffix and store engine displacement as a float.
engine_text = df['Engine'].str.replace(' CC', '')
df['Engine'] = engine_text.astype('float64')

In [16]:
# Remove the " bhp" unit suffix; values stay strings until the cast below.
df['Power'] = df['Power'].str.replace(pat=' bhp', repl='')

In [17]:
# Entries left as the literal string 'null' become NaN so the column can be cast to float.
power_with_nan = df['Power'].replace('null', np.nan)
df['Power'] = power_with_nan.astype('float64')

In [18]:
# Re-count missing values; the 'null' Power entries now show up as NaN.
df.isnull().sum()

Name                  0
Location              0
Year                  0
Kilometers_Driven     0
Fuel_Type             0
Transmission          0
Owner_Type            0
Mileage               0
Engine                0
Power                39
Seats                 0
dtype: int64

In [19]:
# Remove the rows whose Power could not be parsed (NaN after the cast).
df = df.dropna(axis="index", how="any", subset=["Power"])

In [20]:
# Preview the frame now that Mileage, Engine and Power are numeric.
df.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats
0,Mahindra XUV500 W6 2WD,Kochi,2017,41290,Diesel,Manual,Second,19.230769,2179.0,140.0,7.0
1,BMW 5 Series 520d Luxury Line,Delhi,2013,65000,Diesel,Automatic,First,27.019231,1995.0,190.0,5.0
2,Toyota Corolla Altis 1.8 G,Bangalore,2016,16471,Petrol,Manual,Second,19.167785,1798.0,138.03,5.0
3,Hyundai i20 Asta 1.4 CRDi,Pune,2015,54339,Diesel,Manual,Second,27.091346,1396.0,88.73,5.0
4,Mahindra KUV 100 mFALCON G80 K2,Hyderabad,2016,24025,Petrol,Manual,First,24.362416,1198.0,82.0,6.0


In [21]:
# The free-text car name is not used as a feature.
df = df.drop(columns=["Name"])

In [23]:
# One-hot encode Location against a FIXED category list so the dummy columns
# (names and order) do not depend on which cities happen to occur in this file.
# A plain get_dummies() would emit fewer/more columns if a city were absent or
# new, which would misalign the feature matrix with the hard-coded scaler stats.
# NOTE(review): the list mirrors the Location_* columns shown by
# test_df.columns below; presumably identical to the training set -- confirm.
LOCATION_CATEGORIES = [
    'Ahmedabad', 'Bangalore', 'Chennai', 'Coimbatore', 'Delhi', 'Hyderabad',
    'Jaipur', 'Kochi', 'Kolkata', 'Mumbai', 'Pune',
]
Location = df[["Location"]].astype(
    pd.CategoricalDtype(categories=LOCATION_CATEGORIES)
)
# No category is dropped (drop_first=False). A value outside the list becomes
# NaN and therefore encodes as an all-zero row.
Location = pd.get_dummies(Location)

In [25]:
# One-hot encode Fuel_Type against a FIXED category list so the dummy columns
# (names and order) do not depend on which fuels happen to occur in this file.
# A plain get_dummies() would emit fewer/more columns if a fuel were absent or
# new, which would misalign the feature matrix with the hard-coded scaler stats.
# NOTE(review): the list mirrors the Fuel_Type_* columns shown by
# test_df.columns below; presumably identical to the training set -- confirm.
FUEL_TYPE_CATEGORIES = ['CNG', 'Diesel', 'LPG', 'Petrol']
Fuel_Type = df[["Fuel_Type"]].astype(
    pd.CategoricalDtype(categories=FUEL_TYPE_CATEGORIES)
)
# No category is dropped (drop_first=False). A value outside the list becomes
# NaN and therefore encodes as an all-zero row.
Fuel_Type = pd.get_dummies(Fuel_Type)

In [26]:
# Build the single Transmission_Manual indicator explicitly (1 = Manual, 0 = anything else).
# get_dummies(..., drop_first=True) drops whichever category sorts first in
# THIS data, so a file containing only one transmission type would produce no
# column at all; comparing against "Manual" always yields exactly one column.
Transmission = (
    df[["Transmission"]]
    .eq("Manual")
    .astype(int)
    .rename(columns={"Transmission": "Transmission_Manual"})
)

In [27]:
# Ordinal-encode ownership history: position in the list is the integer code.
owner_levels = ['First', 'Second', 'Third', 'Fourth & Above']
label_dict = {level: code for code, level in enumerate(owner_levels)}
df['Owner_Type_Encoded'] = df['Owner_Type'].map(label_dict)

df.head()

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Owner_Type_Encoded
0,Kochi,2017,41290,Diesel,Manual,Second,19.230769,2179.0,140.0,7.0,1
1,Delhi,2013,65000,Diesel,Automatic,First,27.019231,1995.0,190.0,5.0,0
2,Bangalore,2016,16471,Petrol,Manual,Second,19.167785,1798.0,138.03,5.0,1
3,Pune,2015,54339,Diesel,Manual,Second,27.091346,1396.0,88.73,5.0,1
4,Hyderabad,2016,24025,Petrol,Manual,First,24.362416,1198.0,82.0,6.0,0


In [33]:
# Join the numeric columns with the encoded categorical blocks, aligned on the row index.
feature_parts = [df, Location, Fuel_Type, Transmission]
test_df = pd.concat(feature_parts, axis="columns")

In [34]:
# The raw categorical columns are superseded by their encoded versions.
raw_categoricals = ["Location", "Fuel_Type", "Transmission", "Owner_Type"]
test_df = test_df.drop(columns=raw_categoricals)

In [35]:
# Preview the assembled, fully numeric feature table.
test_df.head()

Unnamed: 0,Year,Kilometers_Driven,Mileage,Engine,Power,Seats,Owner_Type_Encoded,Location_Ahmedabad,Location_Bangalore,Location_Chennai,...,Location_Jaipur,Location_Kochi,Location_Kolkata,Location_Mumbai,Location_Pune,Fuel_Type_CNG,Fuel_Type_Diesel,Fuel_Type_LPG,Fuel_Type_Petrol,Transmission_Manual
0,2017,41290,19.230769,2179.0,140.0,7.0,1,0,0,0,...,0,1,0,0,0,0,1,0,0,1
1,2013,65000,27.019231,1995.0,190.0,5.0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,2016,16471,19.167785,1798.0,138.03,5.0,1,0,1,0,...,0,0,0,0,0,0,0,0,1,1
3,2015,54339,27.091346,1396.0,88.73,5.0,1,0,0,0,...,0,0,0,0,1,0,1,0,0,1
4,2016,24025,24.362416,1198.0,82.0,6.0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1


In [31]:
# Confirm the final feature names and their order (the scaler statistics below are positional).
test_df.columns

Index(['Year', 'Kilometers_Driven', 'Mileage', 'Engine', 'Power', 'Seats',
       'Owner_Type_Encoded', 'Location_Ahmedabad', 'Location_Bangalore',
       'Location_Chennai', 'Location_Coimbatore', 'Location_Delhi',
       'Location_Hyderabad', 'Location_Jaipur', 'Location_Kochi',
       'Location_Kolkata', 'Location_Mumbai', 'Location_Pune', 'Fuel_Type_CNG',
       'Fuel_Type_Diesel', 'Fuel_Type_LPG', 'Fuel_Type_Petrol',
       'Transmission_Manual'],
      dtype='object')

In [36]:
# Convert the feature table to a float64 matrix for scaling/prediction.
# The dtype is forced explicitly: left to inference, a frame mixing numeric and
# boolean dummy columns (newer pandas) becomes an object array, and any leftover
# non-numeric column would only be noticed later. This fails here instead.
test_df = np.asarray(test_df, dtype=np.float64)

In [37]:
# Hard-coded per-feature statistics, 23 values each, applied positionally to the
# columns of test_df (order as listed by test_df.columns above).
# NOTE(review): presumably copied from the StandardScaler fitted on the training
# data -- confirm, and consider persisting the fitted scaler instead.
mean = [2.01344575e+03, 6.02636521e+04, 2.29749528e+01, 1.62325660e+03,
       1.13124335e+02, 5.29178886e+00, 2.02346041e-01, 3.59237537e-02,
       6.41495601e-02, 7.99120235e-02, 1.05938416e-01, 9.64076246e-02,
       1.08137830e-01, 7.11143695e-02, 1.09604106e-01, 9.31085044e-02,
       1.35630499e-01, 1.00073314e-01, 1.06304985e-02, 5.40322581e-01,
       2.19941349e-03, 4.46847507e-01, 7.13709677e-01]

# Per-feature standard deviations; assigned to scaler.scale_ below.
std = [3.13655317e+00, 1.29997668e+05, 5.27023100e+00, 5.85343287e+02,
       5.27055317e+01, 8.13588786e-01, 4.55620241e-01, 1.86100074e-01,
       2.45019171e-01, 2.71156951e-01, 3.07758783e-01, 2.95149444e-01,
       3.10554407e-01, 2.57015789e-01, 3.12395656e-01, 2.90584430e-01,
       3.42395775e-01, 3.00097727e-01, 1.02554820e-01, 4.98371437e-01,
       4.68463026e-02, 4.97166785e-01, 4.52026740e-01]

# Per-feature variances; the values appear to be the element-wise square of std.
var = [9.83796579e+00, 1.68993936e+10, 2.77753348e+01, 3.42626764e+05,
       2.77787307e+03, 6.61926712e-01, 2.07589804e-01, 3.46332376e-02,
       6.00343941e-02, 7.35260920e-02, 9.47154683e-02, 8.71131945e-02,
       9.64440397e-02, 6.60571160e-02, 9.75910456e-02, 8.44393108e-02,
       1.17234866e-01, 9.00586457e-02, 1.05174910e-02, 2.48374089e-01,
       2.19457607e-03, 2.47174813e-01, 2.04328174e-01]  

# Sample count assigned to scaler.n_samples_seen_ below.
n_samples_seen=2728

In [40]:
# Rebuild a "fitted" StandardScaler from the recorded statistics instead of
# refitting on the test data.
scaler = StandardScaler()
# Store the fitted attributes as float arrays (the type scikit-learn itself
# produces) rather than plain Python lists.
scaler.mean_ = np.asarray(mean, dtype=np.float64)
scaler.scale_ = np.asarray(std, dtype=np.float64)
scaler.var_ = np.asarray(var, dtype=np.float64)
scaler.n_samples_seen_ = n_samples_seen
# Declaring the expected feature count lets transform() reject an input whose
# number of columns differs, with a clear error instead of a broadcasting failure.
scaler.n_features_in_ = scaler.mean_.shape[0]

In [41]:
# Standardize the feature matrix with the reconstructed scaler.
inp = scaler.transform(test_df)

In [42]:
# Load the trained regressor from disk.
# NOTE: unpickling executes arbitrary code -- only load model files from a trusted source.
with open('ada_reg_xg1.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
print("Model Loaded")

Model Loaded


In [45]:
# Run the loaded model on the standardized features; one prediction per row.
prediction = model.predict(inp)

In [47]:
# Display the predictions. The previous loop called print() with no argument,
# which emits one blank line per prediction and never shows a value.
prediction

array([12.274903 , 16.748812 , 10.2911825, ...,  6.033526 ,  2.2198777,
        2.5210886], dtype=float32)

In [59]:
# Wrap the predictions in a single-column DataFrame for export.
df_pred = pd.DataFrame({'Predicted_value': prediction})

In [60]:
# Preview the first predicted values.
df_pred.head()

Unnamed: 0,Predicted_value
0,12.274903
1,16.748812
2,10.291183
3,5.770888
4,5.616024


In [62]:
# Check the number of predictions (one row per test row that survived cleaning).
df_pred.shape

(2462, 1)

In [63]:
# Save the predictions to CSV in the working directory, without the index column.
df_pred.to_csv('test_data_predictions.csv', index=False)