In [8]:
import numpy as np 
import pandas as pd
import feather
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
chunk_list=[]
chunksize = 10 ** 6
for chunk in pd.read_csv("data/train.csv", chunksize=chunksize):
  chunk['pickup_datetime'] = chunk['pickup_datetime'].str.slice(0, 16)
  chunk['pickup_datetime'] = pd.to_datetime(chunk['pickup_datetime'], utc=True, format='%Y-%m-%d %H:%M')
  chunk_list.append(chunk)
train_df = pd.concat(chunk_list)

In [None]:
train_df = pd.to_feather('data/train.feather')# converting in feather format
train_df = pd.read_feather('data/train.feather')# reading from feather format
train1_df = train_df[:10000000]

# 1 : Data cleaning

__Dropping Null Values__

In [None]:
print (train1_df.isnull().sum())# prints the count of null values for each coulmn

In [None]:
print('Old size: %d' % len(train_df))
train1_df = train1_df.dropna(how = 'any', axis = 'rows')
print('New size: %d' % len(train_df))

In [None]:
print (train_df.isnull().sum())

__Generating absolute latitude and longitude__

In [None]:
def abs_lat_long(df):
    df['abs_diff_longitude'] = (df.dropoff_longitude - df.pickup_longitude).abs()
    df['abs_diff_latitude'] = (df.dropoff_latitude - df.pickup_latitude).abs()
    
abs_lat_long(train1_df)

__PLotting the Scatter plot of abs_diff_longitutde and abs_diff_latitude__

In [None]:
plot=train1_df[:2000].plot.scatter('abs_diff_longitude', 'abs_diff_latitude')

__Removing the outliers__

In [None]:
#reference : starter code
train1_df = train1_df[(train1_df.abs_diff_longitude < 5.0) & (train1_df.abs_diff_latitude < 5.0)]#removing outliers

__ Limiting the fare amount__

In [None]:
train1_df= train1_df[train1_df.fare_amount>2]
train1_df= train1_df[train1_df.fare_amount<200]

__Adding Euclidian distance as feature__

In [None]:
train1_df['distance_travelled'] = (train1_df['abs_diff_longitude'] ** 2 + train1_df['abs_diff_latitude'] ** 2) ** .5

__Setting up Latitude and Longitude Range for ride__

In [None]:
train1_df = train1_df.loc[train1_df.pickup_longitude < -71]
train1_df = train1_df.loc[train1_df.pickup_longitude > -74.5]
train1_df = train1_df.loc[train1_df.pickup_latitude < 42]
train1_df = train1_df.loc[train1_df.pickup_latitude > 40]

In [18]:
print('Old size: %d' % len(train1_df))
train1_df = train1_df.loc[train1_df.dropoff_longitude < -71]
train1_df = train1_df.loc[train1_df.dropoff_longitude > -74.5]
train1_df = train1_df.loc[train1_df.dropoff_latitude < 42]
train1_df = train1_df.loc[train1_df.dropoff_latitude > 40]
print('new size: %d' % len(train1_df))

Old size: 9999681
new size: 9999055


__Limiting the Number of passengers to 7__

In [4]:
train1_df = train1_df.loc[train1_df.passenger_count > 0]
train1_df = train1_df.loc[train1_df.passenger_count <= 7]

__Adding Speherical distance__

In [13]:
def haversine(long1, lat1, long2, lat2):
    long1, lat1, long2, lat2 = map(np.radians, [long1, lat1, long2, lat2])
    a = np.sin((lat2-lat1)/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin((long2-long1)/2.0)**2
    return 6367 * 2 * np.arcsin(np.sqrt(a)) *0.62137

train1_df['distance_sp'] = haversine(train1_df.pickup_longitude, train1_df.pickup_latitude,train1_df.dropoff_longitude,
                 train1_df.dropoff_latitude)


__Removing the Rows with distance zero__

In [None]:
train1_df=train1_df[train1_df.distance_travelled!=0]

__Adding date and time features__

In [14]:
def add_datetime_info(dataset):
    dataset['pickup_datetime'] = pd.to_datetime(dataset['pickup_datetime'],format="%Y-%m-%d %H:%M:%S UTC")
    
    dataset['hour'] = dataset.pickup_datetime.dt.hour
    dataset['day'] = dataset.pickup_datetime.dt.day
    dataset['month'] = dataset.pickup_datetime.dt.month
    dataset['weekday'] = dataset.pickup_datetime.dt.weekday
    dataset['year'] = dataset.pickup_datetime.dt.year
    
    return dataset

add_datetime_info(train1_df)


Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_diff_longitude,abs_diff_latitude,distance_travelled,hour,day,month,weekday,year,distance_sp
0,4.500000,2009-06-15 17:26:21,-73.844315,40.721317,-73.841614,40.712276,1,0.002701,0.009041,0.009436,17,15,6,0,2009,0.640273
1,16.900000,2010-01-05 16:52:16,-74.016045,40.711304,-73.979271,40.782005,1,0.036774,0.070702,0.079693,16,5,1,1,2010,5.247133
2,5.700000,2011-08-18 00:35:00,-73.982735,40.761269,-73.991241,40.750561,2,0.008507,0.010708,0.013676,0,18,8,3,2011,0.862941
3,7.700000,2012-04-21 04:30:42,-73.987129,40.733143,-73.991570,40.758091,1,0.004440,0.024948,0.025340,4,21,4,5,2012,1.738424
4,5.300000,2010-03-09 07:51:00,-73.968094,40.768009,-73.956657,40.783764,1,0.011436,0.015755,0.019468,7,9,3,1,2010,1.241268
5,12.100000,2011-01-06 09:50:45,-74.000961,40.731628,-73.972893,40.758232,1,0.028069,0.026604,0.038673,9,6,1,3,2011,2.351610
6,7.500000,2012-11-20 20:35:00,-73.980003,40.751663,-73.973801,40.764843,1,0.006203,0.013180,0.014566,20,20,11,1,2012,0.966202
7,16.500000,2012-01-04 17:22:00,-73.951302,40.774139,-73.990097,40.751049,1,0.038795,0.023090,0.045147,17,4,1,2,2012,2.580443
8,9.000000,2012-12-03 13:10:00,-74.006462,40.726711,-73.993080,40.731628,1,0.013382,0.004917,0.014257,13,3,12,0,2012,0.778142
9,8.900000,2009-09-02 01:11:00,-73.980659,40.733871,-73.991539,40.758137,2,0.010880,0.024265,0.026593,1,2,9,2,2009,1.769486


__Saving data to feather file__

In [4]:
feather.write_dataframe(train1_df, 'train1.feather')

# 2 : Correlation

In [5]:
train1_df[['distance_travelled','fare_amount']].corr()

Unnamed: 0,distance_travelled,fare_amount
distance_travelled,1.0,0.844649
fare_amount,0.844649,1.0


In [6]:
train1_df[['distance_travelled','hour']].corr()

Unnamed: 0,distance_travelled,hour
distance_travelled,1.0,-0.028343
hour,-0.028343,1.0


In [39]:
train1_df[['fare_amount','hour']].corr()


Unnamed: 0,fare_amount,hour
fare_amount,1.0,-0.01707
hour,-0.01707,1.0


# 3 : Visualization

In [23]:
plt.style.use('seaborn-whitegrid')

In [None]:
train1_df.plot.scatter('fare', 'distance_travelled')

In [25]:
train1_df.plot.scatter('hour', 'distance_travelled')

Using matplotlib backend: Qt5Agg


<matplotlib.axes._subplots.AxesSubplot at 0x21123ececc0>

In [26]:
train1_df.plot.scatter('hour', 'fare_amount')

<matplotlib.axes._subplots.AxesSubplot at 0x21126741f60>

# 4 : Extra Visualization

In [40]:
# generating a plot of correlation between each pair of features (heat map)
%matplotlib
plt.subplots(figsize=(20,15))
matrix = train1_df.corr()
sns.heatmap(matrix)
plt.xticks(rotation=90)
plt.yticks(rotation=45)


Using matplotlib backend: Qt5Agg


(array([  0.5,   1.5,   2.5,   3.5,   4.5,   5.5,   6.5,   7.5,   8.5,
          9.5,  10.5,  11.5,  12.5,  13.5,  14.5,  15.5]),
 <a list of 16 Text yticklabel objects>)

# 5 : Adding another Feature 

In [24]:
from pandas.tseries.holiday import USFederalHolidayCalendar
cal = USFederalHolidayCalendar()
holidays = cal.holidays(start='2009-01-01', end='2015-12-31').to_pydatetime()

train1_df['holiday_or_not'] = train1_df.pickup_datetime.apply(lambda x: 1 if x in holidays else 0)

# Training 

In [15]:
test_df = pd.read_csv('data/test.csv',low_memory=True)
test_df['abs_diff_latitude'] = (test_df.dropoff_latitude-test_df.pickup_latitude).abs()
test_df['abs_diff_longitude'] = (test_df.dropoff_longitude-test_df.pickup_longitude).abs()
test_df['distance_sp'] = haversine(test_df.pickup_longitude, test_df.pickup_latitude,test_df.dropoff_longitude,
                 test_df.dropoff_latitude)
test_df['distance_travelled']=(test_df['abs_diff_longitude'] ** 2 + test_df['abs_diff_latitude'] ** 2) ** .5
add_datetime_info(test_df)
test_df_id = list(test_df.pop('key'))

__Linear Regression__

In [41]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
p=lr.fit(train1_df[:10000000][['abs_diff_latitude','abs_diff_longitude','pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude','distance_travelled','passenger_count']], train1_df[:10000000]['fare_amount'])



In [45]:
lr.coef_

array([ -1.67776978e+02,  -1.71950317e+02,  -1.90558491e+01,
         2.55115843e+00,  -8.69581985e+00,  -3.21768951e+00,
         4.16982910e+02,   3.93426120e-02], dtype=float32)

In [13]:
preds_lr = lr.predict(test_df[['abs_diff_latitude','abs_diff_longitude','pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude','distance_travelled','passenger_count']])
sub=pd.DataFrame({'key':test_df_id,'fare_amount':preds_lr})
sub.to_csv('submission.csv',index=False)

__Random Forest__

In [17]:
from sklearn.ensemble import RandomForestRegressor as rf
random_forest = rf(n_estimators = 10, max_depth = 10, max_features = None, oob_score = True, bootstrap = True, verbose = 1, n_jobs = -1)
random_forest.fit(train1_df[:10000000][['pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude','distance_travelled','passenger_count','hour']], train1_df[:10000000]['fare_amount'])



[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  4.3min finished
  warn("Some inputs do not have OOB scores. "


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
           max_features=None, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=True, random_state=None, verbose=1, warm_start=False)

In [18]:
preds_lr = random_forest.predict(test_df[['pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude','distance_travelled','passenger_count','hour']])
sub=pd.DataFrame({'key':test_df_id,'fare_amount':preds_lr})
sub.to_csv('submission1.csv',index=False)

[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
