In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from haversine import haversine
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score
import warnings
warnings.filterwarnings('ignore')



print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['test.csv', 'test', 'sample_submission', 'train', 'train.csv', 'sample_submission.csv']


# 1 Importation des données

In [3]:
# Load the raw training and test sets from the Kaggle input directory.
# The `index_col = 0` variant is left commented out, so pandas keeps its default
# integer index and the files' first column stays a regular column.
TRAIN_CSV_PATH = '../input/train.csv'
TEST_CSV_PATH = '../input/test.csv'
data = pd.read_csv(TRAIN_CSV_PATH)  # , index_col = 0)
test = pd.read_csv(TEST_CSV_PATH)  # , index_col = 0)

# 2 Analyse des données

In [None]:
data.head()

# 3 Feature selection

In [None]:
# Baseline experiment: the passenger count is the only feature.
SELECTED_COLUMNS = ['passenger_count']
X = data[SELECTED_COLUMNS]
# The model is trained on log1p(trip_duration); predictions are mapped back to
# seconds when the submission is built.
y = np.log1p(data['trip_duration'])
# Preview kept as the LAST expression of the cell: a bare expression in the middle
# of a cell is evaluated but never rendered by the notebook.
X.head()


# Test du choix du modèle avec une seule feature

In [None]:
# Random forest with a fixed seed so repeated runs build the same trees.
forest_params = {'random_state': 42}
rf = RandomForestRegressor(**forest_params)
# Fit on the single-feature matrix; the fitted estimator is the cell's output.
rf.fit(X, y)

# Test de la validation croisée avec une seule feature

In [None]:
X.shape

In [None]:
# 3-fold cross-validation. scikit-learn reports the *negated* MSE for this scorer,
# so the sign is flipped to get one positive MSE (on the log1p scale) per fold.
neg_mse_per_fold = cross_val_score(rf, X, y, cv=3, scoring='neg_mean_squared_error')
cv_scores = -neg_mse_per_fold
cv_scores

In [None]:
np.mean(cv_scores)

In [None]:
# Predict on the test set using the same feature columns as in training.
X_test = test[SELECTED_COLUMNS]
# The target was transformed with log1p, so expm1 is its exact inverse
# (and is more accurate than exp(x) - 1 for small values).
predictions = np.expm1(rf.predict(X_test))

# One row per test id; the index built from test['id'] becomes the `id` column
# of the CSV, next to the predicted `trip_duration`.
pred = pd.DataFrame(predictions, index=test['id'], columns=['trip_duration'])
pred.to_csv("submission_.csv")

# Read the file back to check the layout that was actually written.
pd.read_csv('submission_.csv').head()

# 4 Preprocessing

In [None]:
# Parse the timestamp columns into datetime values so that date parts can be
# extracted through the `.dt` accessor afterwards.
for frame in (data, test):
    frame['pickup_datetime'] = pd.to_datetime(frame['pickup_datetime'])

# The drop-off timestamp is only converted for the training frame.
data['dropoff_datetime'] = pd.to_datetime(data['dropoff_datetime'])

In [None]:
# Derive calendar features from the pickup timestamp, identically for the
# training and the test frame.
for frame in (data, test):
    pickup = frame['pickup_datetime'].dt
    # `.dt.weekday_name` was deprecated in pandas 0.23 and removed in 1.0;
    # `.dt.day_name()` is its replacement and returns the same English day names.
    frame['weekday'] = pickup.day_name()
    frame['month'] = pickup.month
    frame['weekday_num'] = pickup.weekday  # Monday == 0 ... Sunday == 6
    frame['pickup_hour'] = pickup.hour

In [None]:
def calcul_distance(df):
    """Return the haversine distance between a trip's pickup and drop-off points.

    Parameters
    ----------
    df : row-like object (despite the name, it is called with a single trip)
        Must be indexable by 'pickup_latitude', 'pickup_longitude',
        'dropoff_latitude' and 'dropoff_longitude'.

    Returns
    -------
    The value computed by `haversine` for the two (latitude, longitude) pairs.
    NOTE(review): presumably kilometres, the haversine package default — confirm.
    """
    endpoints = [
        (df['pickup_latitude'], df['pickup_longitude']),
        (df['dropoff_latitude'], df['dropoff_longitude']),
    ]
    return haversine(*endpoints)

In [None]:
# Store the pickup -> drop-off distance of every trip in a new `distance` column.
# The function is handed to `apply` directly; axis=1 feeds it one row at a time.
data['distance'] = data.apply(calcul_distance, axis=1)
test['distance'] = test.apply(calcul_distance, axis=1)

In [None]:
# Column dtypes of both frames, side by side. A notebook cell only renders its
# last expression, so listing the two frames on separate lines never displayed
# the training dtypes. Columns missing from one frame show up as NaN.
pd.concat({'data': data.dtypes, 'test': test.dtypes}, axis=1)

In [None]:
# One-hot encode the categorical features of the training frame into indicator
# columns, one (source column, prefix) pair at a time.
# `drop_first=True` omits the first level of each feature, which then acts as the
# reference category. This is what dropping `dummy.columns[0]` used to do; that
# column is a regular indicator, not an index.
# NOTE(review): passenger_count is only cleaned in a later cell, so its indicators
# reflect the raw values.
TRAIN_DUMMY_FEATURES = [
    ('store_and_fwd_flag', 'flag'),
    ('vendor_id', 'vendor_id'),
    ('month', 'month'),
    ('weekday_num', 'weekday_num'),
    ('pickup_hour', 'pickup_hour'),
    ('passenger_count', 'passenger_count'),
]
for column, prefix in TRAIN_DUMMY_FEATURES:
    dummy = pd.get_dummies(data[column], prefix=prefix, drop_first=True)
    # Drop any indicators left over from a previous run of this cell so that
    # re-running it does not create duplicated columns.
    data = pd.concat([data.drop(columns=dummy.columns, errors='ignore'), dummy], axis=1)

In [None]:
# One-hot encode the same categorical features on the test frame.
# The levels are taken from the TRAINING frame, so both frames end up with the
# same indicator columns even when a value occurs in only one of them. A test
# value never seen in training simply gets all-zero indicators.
# `drop_first=True` omits the first (smallest) level of each feature, i.e. the
# reference category; the dropped column is a regular indicator, not an index.
# NOTE(review): passenger_count is only cleaned in a later cell, so its indicators
# reflect the raw values.
TEST_DUMMY_FEATURES = [
    ('store_and_fwd_flag', 'flag'),
    ('vendor_id', 'vendor_id'),
    ('month', 'month'),
    ('weekday_num', 'weekday_num'),
    ('pickup_hour', 'pickup_hour'),
    ('passenger_count', 'passenger_count'),
]
for column, prefix in TEST_DUMMY_FEATURES:
    train_levels = sorted(data[column].dropna().unique())
    encoded = test[column].astype(pd.CategoricalDtype(categories=train_levels))
    dummy = pd.get_dummies(encoded, prefix=prefix, drop_first=True)
    # Drop any indicators left over from a previous run of this cell so that
    # re-running it does not create duplicated columns.
    test = pd.concat([test.drop(columns=dummy.columns, errors='ignore'), dummy], axis=1)

In [None]:
data.head()

In [None]:
data.shape

In [None]:
data.describe()

In [None]:
len(data[data['distance'] == 0])

In [None]:
test.head()

In [None]:
test.shape

In [None]:
test.describe()

In [None]:
len(test[test['distance'] == 0])

# Analyse univariée
#### On va analyser et étudier les variables une par une

In [None]:
# Render floats in fixed-point notation with two decimals (no scientific notation).
pd.set_option('display.float_format', '{:.2f}'.format)

In [None]:
data.passenger_count.value_counts()

In [None]:
test.passenger_count.value_counts()

In [None]:
# Print the summary statistics of the training passenger_count column,
# followed by its median on a separate line.
print(data.passenger_count.describe())
print(f'median = {data.passenger_count.median()}')
# Author's observation: the mean, the median and the mode are all close to 1.

In [None]:
# A trip recorded with 0 passengers is treated as a single-passenger trip.
for frame in (data, test):
    frame['passenger_count'] = frame['passenger_count'].replace(0, 1)


In [None]:
# Keep only the training trips with at most 6 passengers (drops the 7, 8, 9 counts).
data = data[data.passenger_count <= 6]
# Test rows must never be dropped: every test id needs a prediction in the
# submission file. The count is capped at 6 instead, so it stays inside the
# range the model is trained on.
test['passenger_count'] = test.passenger_count.clip(upper=6)

In [None]:
# Passenger-count frequencies of both frames, side by side. A notebook cell only
# renders its last expression, so two separate lines never showed the training counts.
pd.concat(
    {
        'data': data.passenger_count.value_counts(),
        'test': test.passenger_count.value_counts(),
    },
    axis=1,
)

In [None]:
# Number of trips per passenger count.
# The series is passed as `x=`: recent seaborn versions treat the first positional
# argument as `data`, so a bare positional Series no longer means "the x variable".
fig, ax = plt.subplots()
sns.countplot(x=data.passenger_count, ax=ax)
ax.set(title='Trips per passenger count', xlabel='passenger_count', ylabel='trips')
plt.show()

In [None]:
data.dtypes.reset_index()

In [None]:
# Distribution of the trip pickup hours over the 24 hours of the day.
# The series is passed as `x=`: recent seaborn versions treat the first positional
# argument as `data`, so a bare positional Series no longer means "the x variable".
fig, ax = plt.subplots()
sns.countplot(x=data.pickup_hour, ax=ax)
ax.set(title='Trips per pickup hour', xlabel='pickup_hour', ylabel='trips')
plt.show()

# Sélection des features

In [None]:
# First check the positional index of every column (features and label).
list(enumerate(data.columns))

In [None]:
# Second experiment: passenger count plus the haversine distance.
SELECTED_COLUMNS = ['passenger_count', 'distance']
X_two_features = data[SELECTED_COLUMNS]
# Same log1p transform of the target as in the single-feature baseline.
y_two_features = np.log1p(data['trip_duration'])
# Preview kept as the LAST expression of the cell: a bare expression in the middle
# of a cell is evaluated but never rendered by the notebook.
X_two_features.head()


In [None]:
X_two_features.shape, y_two_features.shape

In [None]:
# Fresh random forest (fixed seed) for the two-feature experiment; it replaces
# the single-feature model previously bound to `rf`.
forest_params = {'random_state': 42}
rf = RandomForestRegressor(**forest_params)
# The fitted estimator is the cell's output.
rf.fit(X_two_features, y_two_features)

In [None]:
# 3-fold cross-validation on the two-feature matrix. scikit-learn reports the
# *negated* MSE for this scorer, so the sign is flipped to get positive values.
neg_mse_per_fold = cross_val_score(
    rf, X_two_features, y_two_features, cv=3, scoring='neg_mean_squared_error'
)
cv_scores = -neg_mse_per_fold
cv_scores

In [None]:
np.mean(cv_scores)

In [None]:
# Predict on the test set using the two selected feature columns.
X_test = test[SELECTED_COLUMNS]
# The target was transformed with log1p, so expm1 is its exact inverse
# (and is more accurate than exp(x) - 1 for small values).
predictions = np.expm1(rf.predict(X_test))

# One row per test id; the index built from test['id'] becomes the `id` column
# of the CSV, next to the predicted `trip_duration`.
# Note: this overwrites the file written by the single-feature experiment.
pred = pd.DataFrame(predictions, index=test['id'], columns=['trip_duration'])
pred.to_csv("submission_.csv")

# Read the file back to check the layout that was actually written.
pd.read_csv('submission_.csv').head()