In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw training data from the Excel workbook and preview the first rows.
df = pd.read_excel('Data_Train.xlsx')
df.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


# dropping nan values

In [3]:
# Keep only the rows in which no column is missing.
df = df.dropna(axis=0, how='any')

In [4]:

# (rows, columns) remaining after the rows with missing values were dropped.
df.shape


(10682, 11)

# dropping duplicate rows

In [7]:
# Remove exact duplicate rows, keeping the first occurrence of each.
df = df.drop_duplicates(keep='first')

In [8]:
# (rows, columns) remaining after duplicate rows were removed.
df.shape

(10462, 11)

# Data Preprocessing

In [10]:
# Handling Date-Time Data

# We need to convert Date of journey to timestamp to use it properly for prediction

In [11]:
# Parse the journey date once (day/month/year) and derive the numeric
# day-of-month and month features from it; the raw text column is then dropped.
journey_date = pd.to_datetime(df['Date_of_Journey'], format="%d/%m/%Y")
df["Journey_Day"] = journey_date.dt.day
df["Journey_Month"] = journey_date.dt.month
df = df.drop(columns=['Date_of_Journey'])
df.head()

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Journey_Day,Journey_Month
0,IndiGo,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897,24,3
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662,1,5
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882,9,6
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218,12,5
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302,1,3


In [12]:
# Converting Departure Time to date time format

In [13]:
# Parse the departure time once and split it into minute and hour features;
# the raw text column is then dropped.
departure_clock = pd.to_datetime(df['Dep_Time'])
df['dep_mins'] = departure_clock.dt.minute
df['dep_hrs'] = departure_clock.dt.hour
df = df.drop(columns=['Dep_Time'])
df.head()

Unnamed: 0,Airline,Source,Destination,Route,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Journey_Day,Journey_Month,dep_mins,dep_hrs
0,IndiGo,Banglore,New Delhi,BLR → DEL,01:10 22 Mar,2h 50m,non-stop,No info,3897,24,3,20,22
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,13:15,7h 25m,2 stops,No info,7662,1,5,50,5
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,04:25 10 Jun,19h,2 stops,No info,13882,9,6,25,9
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,23:30,5h 25m,1 stop,No info,6218,12,5,5,18
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,21:35,4h 45m,1 stop,No info,13302,1,3,50,16


In [14]:
# Converting Arrival Time to date time format

In [15]:
# Arrival_Time holds either "HH:MM" or "HH:MM DD Mon" (e.g. "01:10 22 Mar").
# Only the clock part is used, so keep the first
# whitespace-separated token and parse it with an explicit format. Parsing the
# mixed column without a format depends on per-element format inference and
# fails on pandas versions that infer one format for the whole column.
arrival_clock = pd.to_datetime(
    df['Arrival_Time'].str.split().str[0],
    format='%H:%M',
)
df['arrival_mins'] = arrival_clock.dt.minute
df['arrival_hrs'] = arrival_clock.dt.hour
df.drop(['Arrival_Time'], axis=1, inplace=True)
df.head()


Unnamed: 0,Airline,Source,Destination,Route,Duration,Total_Stops,Additional_Info,Price,Journey_Day,Journey_Month,dep_mins,dep_hrs,arrival_mins,arrival_hrs
0,IndiGo,Banglore,New Delhi,BLR → DEL,2h 50m,non-stop,No info,3897,24,3,20,22,10,1
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,7h 25m,2 stops,No info,7662,1,5,50,5,15,13
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,19h,2 stops,No info,13882,9,6,25,9,25,4
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,5h 25m,1 stop,No info,6218,12,5,5,18,30,23
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,4h 45m,1 stop,No info,13302,1,3,50,16,35,21


In [16]:
# Splitting Flight Duration into Hours and Minutes

In [17]:
# Split strings such as "2h 50m", "19h" or "5m" into integer hour and minute
# parts. Each part is pulled out with its own pattern, so a missing part simply
# becomes 0 and spacing variants (e.g. "2h50m") are handled as well.
duration_text = df["Duration"].astype(str)

duration_hours = (
    duration_text.str.extract(r'(\d+)\s*h', expand=False)
    .fillna('0')
    .astype(int)
    .tolist()
)
duration_mins = (
    duration_text.str.extract(r'(\d+)\s*m', expand=False)
    .fillna('0')
    .astype(int)
    .tolist()
)

# Normalised "<H>h <M>m" strings, kept under the original name for any cell
# that still refers to `duration`.
duration = [f"{hrs}h {mins}m" for hrs, mins in zip(duration_hours, duration_mins)]

In [18]:
# Attach the parsed hour/minute lists as new columns (positional assignment)
# and discard the original text column.
df = df.assign(
    Duration_hours=duration_hours,
    Duration_mins=duration_mins,
).drop(columns=['Duration'])
df.head()

Unnamed: 0,Airline,Source,Destination,Route,Total_Stops,Additional_Info,Price,Journey_Day,Journey_Month,dep_mins,dep_hrs,arrival_mins,arrival_hrs,Duration_hours,Duration_mins
0,IndiGo,Banglore,New Delhi,BLR → DEL,non-stop,No info,3897,24,3,20,22,10,1,2,50
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,2 stops,No info,7662,1,5,50,5,15,13,7,25
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,2 stops,No info,13882,9,6,25,9,25,4,19,0
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,1 stop,No info,6218,12,5,5,18,30,23,5,25
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,1 stop,No info,13302,1,3,50,16,35,21,4,45


In [19]:
# Separating the independent features from the target feature (Price)

In [20]:
# Features are every column except the target; the target is selected by name
# so the split stays correct even if the column order changes upstream.
X = df.drop(columns=['Price'])
y = df['Price']


In [22]:
# Train Test Split

In [23]:
# Split into training and validation sets BEFORE encoding the categorical
# features, so that no information from the validation rows leaks into the
# preprocessing decisions.
from sklearn.model_selection import train_test_split
split_parts = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = split_parts

# Handling Categorical Data (For Training Set)

In [24]:
# Print the frequency of every category in the training split.
# Each entry pairs the printed heading with the column it summarises.
train_category_report = [
    ('Airline', 'Airline'),
    ('Source', 'Source'),
    ('Destination', 'Destination'),
    ('Total_Stops', 'Total_Stops'),
    ('Additional Info', 'Additional_Info'),
]

for heading, column in train_category_report:
    print(heading)
    print('-' * len(heading))  # underline as wide as the heading
    print(X_train[column].value_counts())
    print('\n')

Airline
-------
Jet Airways                          2954
IndiGo                               1629
Air India                            1377
Multiple carriers                     957
SpiceJet                              618
Vistara                               388
Air Asia                              266
GoAir                                 161
Multiple carriers Premium economy      11
Jet Airways Business                    5
Vistara Premium economy                 2
Trujet                                  1
Name: Airline, dtype: int64


Source
------
Delhi       3481
Kolkata     2296
Banglore    1757
Mumbai       544
Chennai      291
Name: Source, dtype: int64


Destination
-----------
Cochin       3481
Banglore     2296
Delhi        1027
New Delhi     730
Hyderabad     544
Kolkata       291
Name: Destination, dtype: int64


Total_Stops
-----------
1 stop      4495
non-stop    2768
2 stops     1070
3 stops       35
4 stops        1
Name: Total_Stops, dtype: int64


Additional In

In [25]:
# Airline Feature(Categories renaming)

# We can change Multiple carriers Premium economy, Jet Airways Business, Vistara Premium economy to Multiple carriers, Jet Airways, Vistara respectively.

In [26]:
# Fare-class variants of an airline are folded into the base airline name.
Airline_Dict = dict([
    ('Multiple carriers Premium economy', 'Multiple carriers'),
    ('Jet Airways Business', 'Jet Airways'),
    ('Vistara Premium economy', 'Vistara'),
])

# Names without an entry in the mapping are kept unchanged.
X_train['Airline'] = X_train['Airline'].map(
    lambda airline: Airline_Dict.get(airline, airline)
)


In [27]:
# We drop Trujet: the Airline feature already has high cardinality and this airline appears only once in the training data, so the model cannot learn enough from it.

In [28]:
# Show the training rows whose airline is Trujet.
X_train[X_train['Airline']=='Trujet']

Unnamed: 0,Airline,Source,Destination,Route,Total_Stops,Additional_Info,Journey_Day,Journey_Month,dep_mins,dep_hrs,arrival_mins,arrival_hrs,Duration_hours,Duration_mins
2878,Trujet,Mumbai,Hyderabad,BOM → NDC → HYD,1 stop,No info,6,3,5,13,20,16,3,15


In [29]:
# Positional (0-based) offsets of the Trujet rows inside X_train. These are
# positions, not index labels, and they depend on the current train/test split.
np.where(X_train['Airline']=='Trujet')

(array([7659], dtype=int64),)

In [30]:
# Inspect the Trujet record by condition rather than by a hard-coded positional
# offset, which would point at a different row if the split changed.
# squeeze(axis=0) turns a single matching row into a Series for display.
X_train.loc[X_train['Airline'] == 'Trujet'].squeeze(axis=0)


Airline                     Trujet
Source                      Mumbai
Destination              Hyderabad
Route              BOM → NDC → HYD
Total_Stops                 1 stop
Additional_Info            No info
Journey_Day                      6
Journey_Month                    3
dep_mins                         5
dep_hrs                         13
arrival_mins                    20
arrival_hrs                     16
Duration_hours                   3
Duration_mins                   15
Name: 2878, dtype: object

In [31]:
# Remove the Trujet record(s) from both the features and the target.
# A boolean mask is used instead of a hard-coded row position, so the right
# rows are removed whatever the split looks like; X_train and y_train share the
# same index, so the same mask applies to both.
trujet_mask = X_train['Airline'] == 'Trujet'
y_train = y_train.loc[~trujet_mask].copy()
X_train = X_train.loc[~trujet_mask].copy()

# Feature Categorical Encoding

In [32]:
# One-hot encode the nominal features of the training split; every category
# keeps its own indicator column (drop_first=False).
nominal_columns = ['Airline', 'Source', 'Destination']
encoded_feat = pd.get_dummies(X_train[nominal_columns], drop_first=False)
encoded_feat.head()

Unnamed: 0,Airline_Air Asia,Airline_Air India,Airline_GoAir,Airline_IndiGo,Airline_Jet Airways,Airline_Multiple carriers,Airline_SpiceJet,Airline_Vistara,Source_Banglore,Source_Chennai,Source_Delhi,Source_Kolkata,Source_Mumbai,Destination_Banglore,Destination_Cochin,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_New Delhi
516,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0
3764,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0
6805,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1
1409,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0
4754,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0


In [33]:
# Total Stops(Label encoding)

In [34]:
# Distinct Total_Stops labels present in the training split.
X_train['Total_Stops'].unique()

array(['non-stop', '1 stop', '2 stops', '3 stops', '4 stops'],
      dtype=object)

In [35]:
# Ordinal encoding for Total_Stops: each label maps to its number of stops.
stop_labels = ['non-stop', '1 stop', '2 stops', '3 stops', '4 stops']
Stops = {label: stop_count for stop_count, label in enumerate(stop_labels)}
X_train['Total_Stops'] = X_train['Total_Stops'].map(Stops)

# Additional Info

In [36]:
# Percentage of training rows that fall into each Additional_Info category.
X_train['Additional_Info'].value_counts().div(len(X_train)).mul(100)

No info                         78.381931
In-flight meal not included     18.343690
No check-in baggage included     2.903920
1 Long layover                   0.215105
Change airports                  0.059751
No Info                          0.035851
Business class                   0.023901
1 Short layover                  0.011950
2 Long layover                   0.011950
Red-eye flight                   0.011950
Name: Additional_Info, dtype: float64

# The "No info" category accounts for more than 75% of all records, which suggests this feature carries little information, so we drop Additional_Info

In [38]:

# Additional_Info is not used as a feature; remove it from the training split.
X_train = X_train.drop(columns=['Additional_Info'])

In [39]:
# Append the one-hot columns to the training features, then remove the original
# text columns (Route is dropped as well).
raw_text_columns = ['Airline', 'Source', 'Destination', 'Route']
X_train = pd.concat([X_train, encoded_feat], axis=1).drop(columns=raw_text_columns)
X_train.head()

Unnamed: 0,Total_Stops,Journey_Day,Journey_Month,dep_mins,dep_hrs,arrival_mins,arrival_hrs,Duration_hours,Duration_mins,Airline_Air Asia,...,Source_Chennai,Source_Delhi,Source_Kolkata,Source_Mumbai,Destination_Banglore,Destination_Cochin,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_New Delhi
516,0,27,5,5,7,20,9,2,15,0,...,1,0,0,0,0,0,0,0,1,0
3764,0,9,6,10,22,0,1,2,50,0,...,0,0,0,0,0,0,1,0,0,0
6805,0,3,3,10,21,5,0,2,55,0,...,0,0,0,0,0,0,0,0,0,1
1409,1,1,6,0,17,30,1,8,30,0,...,0,1,0,0,0,1,0,0,0,0
4754,1,9,6,0,14,35,12,22,35,0,...,0,1,0,0,0,1,0,0,0,0


In [40]:
# dropping Route because Stops and Route are correlated

# Handling Categorical Data (For Validation Set)

In [41]:
# Print the frequency of every category in the validation split.
# Each entry pairs the printed heading with the column it summarises.
validation_category_report = [
    ('Airline', 'Airline'),
    ('Source', 'Source'),
    ('Destination', 'Destination'),
    ('Total_Stops', 'Total_Stops'),
    ('Additional Info', 'Additional_Info'),
]

for heading, column in validation_category_report:
    print(heading)
    print('-' * len(heading))  # underline as wide as the heading
    print(X_test[column].value_counts())
    print('\n')

Airline
-------
Jet Airways                          746
IndiGo                               414
Air India                            317
Multiple carriers                    239
SpiceJet                             197
Vistara                               90
Air Asia                              53
GoAir                                 33
Multiple carriers Premium economy      2
Jet Airways Business                   1
Vistara Premium economy                1
Name: Airline, dtype: int64


Source
------
Delhi       864
Kolkata     564
Banglore    422
Mumbai      153
Chennai      90
Name: Source, dtype: int64


Destination
-----------
Cochin       864
Banglore     564
Delhi        238
New Delhi    184
Hyderabad    153
Kolkata       90
Name: Destination, dtype: int64


Total_Stops
-----------
1 stop      1130
non-stop     707
2 stops      248
3 stops        8
Name: Total_Stops, dtype: int64


Additional Info
---------------
No info                         1622
In-flight meal not includ

In [42]:
# Airline Feature(Categories renaming)

# We can change Multiple carriers Premium economy, Jet Airways Business, Vistara Premium economy to Multiple carriers, Jet Airways, Vistara respectively.

In [43]:
# Apply the same airline renaming that was used for the training split;
# names without an entry in Airline_Dict are kept unchanged.
X_test['Airline'] = X_test['Airline'].map(
    lambda airline: Airline_Dict.get(airline, airline)
)

# Feature Categorical Encoding

In [44]:
# One-hot encode the nominal features of the validation split.
# Note: this rebinds `encoded_feat`, replacing the training-split encoding.
nominal_columns = ['Airline', 'Source', 'Destination']
encoded_feat = pd.get_dummies(X_test[nominal_columns], drop_first=False)
encoded_feat.head()

Unnamed: 0,Airline_Air Asia,Airline_Air India,Airline_GoAir,Airline_IndiGo,Airline_Jet Airways,Airline_Multiple carriers,Airline_SpiceJet,Airline_Vistara,Source_Banglore,Source_Chennai,Source_Delhi,Source_Kolkata,Source_Mumbai,Destination_Banglore,Destination_Cochin,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_New Delhi
3931,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
9004,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0
7638,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0
5591,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1
2937,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0


# Total Stops(Label Encoding)

In [45]:
# Distinct Total_Stops labels present in the validation split.
X_test['Total_Stops'].unique()

array(['non-stop', '1 stop', '2 stops', '3 stops'], dtype=object)

In [46]:
# Ordinal encoding for Total_Stops on the validation split.
# The mapping must be identical to the one used for training (including
# '4 stops'): a reduced mapping would turn any level it lacks into NaN.
Stops = {'non-stop': 0, '1 stop': 1, '2 stops': 2, '3 stops': 3, '4 stops': 4}
X_test['Total_Stops'] = X_test['Total_Stops'].map(Stops)

# Additional_Info

In [47]:
# Percentage of validation rows that fall into each Additional_Info category.
X_test['Additional_Info'].value_counts().div(len(X_test)).mul(100)

No info                         77.496417
In-flight meal not included     18.681319
No check-in baggage included     3.583373
Business class                   0.095557
Change airports                  0.095557
1 Long layover                   0.047778
Name: Additional_Info, dtype: float64

In [48]:
# The "No info" category accounts for more than 75% of all records, which suggests this feature carries little information, so we drop Additional_Info

In [49]:
# Additional_Info is not used as a feature; remove it from the validation split.
X_test = X_test.drop(columns=['Additional_Info'])

In [50]:
# Append the one-hot columns to the validation features and remove the original
# text columns (Route is dropped as well).
X_test = pd.concat([X_test, encoded_feat], axis=1)
X_test = X_test.drop(columns=['Airline', 'Source', 'Destination', 'Route'])

# The validation dummies were built independently of the training ones, so the
# two frames are not guaranteed to have the same columns in the same order.
# Align to the training layout: indicator columns missing here are added as 0,
# columns unknown to the training set are discarded. The scaler fitted on
# X_train then receives exactly the layout it was fitted on.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
X_test.head()

Unnamed: 0,Total_Stops,Journey_Day,Journey_Month,dep_mins,dep_hrs,arrival_mins,arrival_hrs,Duration_hours,Duration_mins,Airline_Air Asia,...,Source_Chennai,Source_Delhi,Source_Kolkata,Source_Mumbai,Destination_Banglore,Destination_Cochin,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_New Delhi
3931,0,1,3,20,13,35,15,2,15,0,...,1,0,0,0,0,0,0,0,1,0
9004,1,9,4,5,7,10,18,11,5,0,...,0,1,0,0,0,1,0,0,0,0
7638,1,21,5,20,10,15,19,8,55,0,...,0,1,0,0,0,1,0,0,0,0
5591,1,21,3,40,11,25,11,23,45,0,...,0,0,0,0,0,0,0,0,0,1
2937,1,9,6,30,20,25,20,23,55,0,...,0,0,1,0,1,0,0,0,0,0


# Feature Scaling

In [51]:
# Learn the per-column mean and standard deviation from the training features
# only, then standardise the training features with them.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_train_scaled

array([[-1.21653397,  1.59334992,  0.24625653, ..., -0.26342546,
         5.26839764, -0.30915167],
       [-1.21653397, -0.52906877,  1.10882442, ..., -0.26342546,
        -0.18981103, -0.30915167],
       [-1.21653397, -1.23654167, -1.47887923, ..., -0.26342546,
        -0.18981103,  3.2346582 ],
       ...,
       [-1.21653397, -1.23654167, -0.61631135, ..., -0.26342546,
        -0.18981103, -0.30915167],
       [-1.21653397,  1.59334992,  0.24625653, ...,  3.79614033,
        -0.18981103, -0.30915167],
       [-1.21653397, -1.47236597,  0.24625653, ..., -0.26342546,
        -0.18981103, -0.30915167]])

In [52]:
# Standardise the validation features with the scaler that was fitted on the
# training features (transform only, no re-fitting on validation data).
X_test_scaled=scaler.transform(X_test)
X_test_scaled

array([[-1.21653397, -1.47236597, -1.47887923, ..., -0.26342546,
         5.26839764, -0.30915167],
       [ 0.29317332, -0.52906877, -0.61631135, ..., -0.26342546,
        -0.18981103, -0.30915167],
       [ 0.29317332,  0.88587702,  0.24625653, ..., -0.26342546,
        -0.18981103, -0.30915167],
       ...,
       [-1.21653397,  1.59334992,  0.24625653, ..., -0.26342546,
        -0.18981103, -0.30915167],
       [-1.21653397, -1.23654167, -0.61631135, ..., -0.26342546,
        -0.18981103, -0.30915167],
       [-1.21653397, -0.52906877,  1.10882442, ..., -0.26342546,
        -0.18981103, -0.30915167]])

In [53]:

# Wrap the scaled training array in a DataFrame, restoring the feature names.
train_feature_names = list(X_train.columns)
df_Xscaled = pd.DataFrame(data=X_train_scaled, columns=train_feature_names)

In [54]:

# Put the target next to the scaled training features. The target index is
# reset so that both sides are matched on the same 0..n-1 row numbers.
train_target = y_train.reset_index(drop=True)
df_train_scaled = df_Xscaled.join(train_target)
df_train_scaled.head()

Unnamed: 0,Total_Stops,Journey_Day,Journey_Month,dep_mins,dep_hrs,arrival_mins,arrival_hrs,Duration_hours,Duration_mins,Airline_Air Asia,...,Source_Delhi,Source_Kolkata,Source_Mumbai,Destination_Banglore,Destination_Cochin,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_New Delhi,Price
0,-1.216534,1.59335,0.246257,-1.032887,-0.952243,-0.282196,-0.644279,-0.962574,-0.784558,-0.181194,...,-0.843977,-0.614922,-0.263425,-0.614922,-0.843977,-0.374031,-0.263425,5.268398,-0.309152,3687
1,-1.216534,-0.529069,1.108824,-0.766958,1.657713,-1.48531,-1.809154,-0.962574,1.283029,-0.181194,...,-0.843977,-0.614922,-0.263425,-0.614922,-0.843977,2.673575,-0.263425,-0.189811,-0.309152,3749
2,-1.216534,-1.236542,-1.478879,-0.766958,1.483716,-1.184531,-1.954763,-0.962574,1.578398,-0.181194,...,-0.843977,-0.614922,-0.263425,-0.614922,-0.843977,-0.374031,-0.263425,-0.189811,3.234658,7608
3,0.293173,-1.472366,1.108824,-1.298815,0.787728,0.319362,-1.809154,-0.246558,0.101551,-0.181194,...,1.184866,-0.614922,-0.263425,-0.614922,1.184866,-0.374031,-0.263425,-0.189811,-0.309152,7408
4,0.293173,-0.529069,1.108824,-1.298815,0.265736,0.62014,-0.207451,1.424145,0.39692,-0.181194,...,1.184866,-0.614922,-0.263425,-0.614922,1.184866,-0.374031,-0.263425,-0.189811,-0.309152,10262


In [55]:

# Wrap the scaled validation array in a DataFrame, restoring the feature names.
validation_feature_names = list(X_test.columns)
df_Xscaledtest = pd.DataFrame(data=X_test_scaled, columns=validation_feature_names)

In [56]:
# Put the target next to the scaled validation features. The target index is
# reset so that both sides are matched on the same 0..n-1 row numbers.
validation_target = y_test.reset_index(drop=True)
df_test_scaled = df_Xscaledtest.join(validation_target)
df_test_scaled.head()


Unnamed: 0,Total_Stops,Journey_Day,Journey_Month,dep_mins,dep_hrs,arrival_mins,arrival_hrs,Duration_hours,Duration_mins,Airline_Air Asia,...,Source_Delhi,Source_Kolkata,Source_Mumbai,Destination_Banglore,Destination_Cochin,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_New Delhi,Price
0,-1.216534,-1.472366,-1.478879,-0.235102,0.091739,0.62014,0.229376,-0.962574,-0.784558,-0.181194,...,-0.843977,-0.614922,-0.263425,-0.614922,-0.843977,-0.374031,-0.263425,5.268398,-0.309152,7295
1,0.293173,-0.529069,-0.616311,-1.032887,-0.952243,-0.883753,0.666204,0.11145,-1.375297,-0.181194,...,1.184866,-0.614922,-0.263425,-0.614922,1.184866,-0.374031,-0.263425,-0.189811,-0.309152,6601
2,0.293173,0.885877,0.246257,-0.235102,-0.430252,-0.582974,0.811814,-0.246558,1.578398,-0.181194,...,1.184866,-0.614922,-0.263425,-0.614922,1.184866,-0.374031,-0.263425,-0.189811,-0.309152,8266
3,0.293173,0.885877,-1.478879,0.82861,-0.256255,0.018583,-0.353061,1.543481,0.987659,-0.181194,...,-0.843977,-0.614922,-0.263425,-0.614922,-0.843977,-0.374031,-0.263425,-0.189811,3.234658,7832
4,0.293173,-0.529069,1.108824,0.296754,1.309719,0.018583,0.957423,1.543481,1.578398,-0.181194,...,-0.843977,1.626222,-0.263425,1.626222,-0.843977,-0.374031,-0.263425,-0.189811,-0.309152,10203


# Saving Scaled values into csv file

In [57]:
# Write both preprocessed frames to CSV files in the working directory.
# NOTE(review): index_label=False only suppresses the header field for the
# index; the row index itself is still written as the first column. Confirm
# whether index=False was intended.
csv_exports = (
    (df_train_scaled, 'preprocessedtraindata.csv'),
    (df_test_scaled, 'preprocessedtestdata.csv'),
)
for scaled_frame, csv_name in csv_exports:
    scaled_frame.to_csv(csv_name, index_label=False)


In [58]:
# Persist the scaler fitted on the training features so that the same
# standardisation can be re-applied later without re-fitting.
import joblib
scaler_filename = "standardscaler.save"
# joblib.dump returns the list of written file names, which the cell displays.
joblib.dump(scaler, scaler_filename)

['standardscaler.save']