In [0]:
# Inferential Statistics vs Big Data Analytics
## NYC Yellow Taxi Trips (2022–2024)

from pyspark.sql.functions import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import ttest_ind


### Charger le dataset SAMPLE & EDA

### 

In [0]:
# Path of the sample CSV (the file name suggests a 1% hybrid stratified sample — confirm with the data owner).
sample_path = "/Volumes/workspace/default/filestore/yellowtaxisample1pct_hybrid_stratified.csv"

# Load the sample into pandas. No date parsing is requested here; the datetime
# columns are converted with pd.to_datetime in the cleaning cell.
df_sample = pd.read_csv(sample_path)
# Display (rows, columns) as the cell output.
df_sample.shape


In [0]:
# Preview the first rows of the sample.
df_sample.head()

In [0]:
# Number of missing values per column, before any cleaning.
df_sample.isnull().sum()

Analyse des valeurs existantes

In [0]:
# Columns whose value distribution is inspected before cleaning.
cols_focus = [
    "passenger_count",
    "RatecodeID",
    "store_and_fwd_flag",
    "congestion_surcharge",
    "airport_fee",
]

# The loop variable is deliberately NOT named `col`: the notebook does
# `from pyspark.sql.functions import *`, so binding `col` to a string here
# would overwrite the PySpark `col` function used by the null-summary cell
# further down and make it fail with "'str' object is not callable".
for column_name in cols_focus:
    print(f"\n* {column_name}")
    print(df_sample[column_name].value_counts())

In [0]:
# Count rows whose pickup value is greater than the dropoff value.
# NOTE(review): at this point the columns are compared exactly as loaded by
# read_csv (not yet converted with pd.to_datetime) — presumably ISO-formatted
# strings, for which lexicographic order matches chronological order; confirm.
df_sample["tpep_pickup_datetime"].gt(df_sample["tpep_dropoff_datetime"]).sum()


### Nettoyage Sample dataset

In [0]:
# 1/ passenger_count: fill missing values with the column median.
median_passenger = df_sample["passenger_count"].median()

df_sample["passenger_count"] = df_sample["passenger_count"].fillna(median_passenger)

# 2/ RatecodeID: fill missing values with the most frequent value (mode).
mode_ratecode = df_sample["RatecodeID"].mode()[0]

df_sample["RatecodeID"] = df_sample["RatecodeID"].fillna(mode_ratecode)

# 3/ store_and_fwd_flag: fill missing values with "N".
df_sample["store_and_fwd_flag"] = df_sample["store_and_fwd_flag"].fillna("N")

# 4/ congestion_surcharge: fill missing values with 0.
df_sample["congestion_surcharge"] = df_sample["congestion_surcharge"].fillna(0)

# 5/ Merge the duplicate `Airport_fee` column into `airport_fee`, then drop it.
# Guarded so the cell can be re-executed: once `Airport_fee` has been dropped,
# an unguarded merge would raise KeyError on the second run.
if "Airport_fee" in df_sample.columns:
    df_sample["airport_fee"] = df_sample["airport_fee"].fillna(df_sample["Airport_fee"])
    df_sample = df_sample.drop(columns=["Airport_fee"])

# Whatever is still missing after the merge becomes 0.
df_sample["airport_fee"] = df_sample["airport_fee"].fillna(0)

# Convert both timestamp columns to datetime64.
df_sample["tpep_pickup_datetime"] = pd.to_datetime(df_sample["tpep_pickup_datetime"])
df_sample["tpep_dropoff_datetime"] = pd.to_datetime(df_sample["tpep_dropoff_datetime"])

# Rows where pickup is later than dropoff: swap the two timestamps.
# `.values` strips the column labels so the assignment is positional
# (dropoff -> pickup, pickup -> dropoff) instead of being re-aligned by name.
mask = df_sample["tpep_pickup_datetime"] > df_sample["tpep_dropoff_datetime"]
df_sample.loc[mask, ["tpep_pickup_datetime", "tpep_dropoff_datetime"]] = (
    df_sample.loc[mask, ["tpep_dropoff_datetime", "tpep_pickup_datetime"]].values
)



In [0]:
# Number of missing values per column after the cleaning cell.
df_sample.isnull().sum()


### Statistiques inférentielles

In [0]:
# Mean fare of a trip, with a 95% normal-approximation confidence interval.
fare = df_sample["fare_amount"].dropna()
n = len(fare)

mean_fare, std_fare = fare.mean(), fare.std(ddof=1)

# Two-sided 95% critical value of the standard normal distribution.
z = stats.norm.ppf(0.975)
margin = z * (std_fare / np.sqrt(n))

ci_fare = (mean_fare - margin, mean_fare + margin)

mean_fare, ci_fare

In [0]:
# Mean trip distance with a 95% confidence interval.
# Reuses the critical value `z` set by an earlier cell.
distance = df_sample["trip_distance"].dropna()
n = len(distance)

mean_dist, std_dist = distance.mean(), distance.std(ddof=1)

margin = z * (std_dist / np.sqrt(n))
ci_dist = (mean_dist - margin, mean_dist + margin)

mean_dist, ci_dist


In [0]:
# Mean trip duration (minutes) with a 95% confidence interval.
# Reuses the critical value `z` set by an earlier cell.
elapsed = df_sample["tpep_dropoff_datetime"] - df_sample["tpep_pickup_datetime"]
df_sample["duration_min"] = elapsed.dt.total_seconds() / 60

duration = df_sample["duration_min"].dropna()
n = len(duration)

mean_dur, std_dur = duration.mean(), duration.std(ddof=1)

margin = z * (std_dur / np.sqrt(n))
ci_dur = (mean_dur - margin, mean_dur + margin)

mean_dur, ci_dur


In [0]:
# Share of trips with a strictly positive tip, with a 95% (Wald) interval.
# Rows with a missing tip_amount compare as False and therefore count as "no tip".
tip = df_sample["tip_amount"].gt(0)

n = tip.count()
p_hat = tip.mean()

margin = z * np.sqrt((p_hat * (1 - p_hat)) / n)
ci_tip = (p_hat - margin, p_hat + margin)

p_hat, ci_tip


In [0]:
# Make sure the pickup timestamp is datetime64 (no-op if already converted).
df_sample["tpep_pickup_datetime"] = pd.to_datetime(df_sample["tpep_pickup_datetime"])

# Derive calendar features from the pickup timestamp.
pickup_ts = df_sample["tpep_pickup_datetime"]
df_sample["hour"] = pickup_ts.dt.hour
df_sample["day_of_week"] = pickup_ts.dt.day_name()
df_sample["week"] = pickup_ts.dt.isocalendar().week

In [0]:
# Share of trips per pickup hour (index sorted by hour).
n = len(df_sample)

hour_counts = df_sample["hour"].value_counts().sort_index()
hour_prop = hour_counts.div(n)

In [0]:
# 95% interval for each hourly proportion (normal approximation, z fixed at 1.96).
z = 1.96

# Half-width of the interval, computed once and applied on both sides.
hour_margin = z * np.sqrt((hour_prop * (1 - hour_prop)) / n)

hour_ci = pd.DataFrame({
    "proportion": hour_prop,
    "ci_low": hour_prop - hour_margin,
    "ci_high": hour_prop + hour_margin,
})

hour_ci.head()


In [0]:
# Number of trips per pickup hour, shown as a bar chart.
hour_dist = df_sample.groupby("hour").size()

plt.figure(figsize=(10,4))
hour_dist.plot(kind="bar")
plt.title("Distribution des courses par heure")
# Axis label repaired: the accented character was mis-encoded (mojibake).
plt.xlabel("Heure de la journée")
plt.ylabel("Nombre de courses")
plt.xticks(rotation=0)
plt.grid(axis="y", alpha=0.3)
plt.show()


In [0]:
# Calendar order for the x axis (groupby would otherwise sort names alphabetically).
day_order = [
    "Monday", "Tuesday", "Wednesday",
    "Thursday", "Friday", "Saturday", "Sunday"
]

# Trips per weekday, re-ordered Monday -> Sunday.
trips_per_day = df_sample.groupby("day_of_week").size()
day_dist = trips_per_day.reindex(day_order)

# Draw on an explicit Axes instead of the implicit pyplot state.
fig, ax = plt.subplots(figsize=(8,4))
day_dist.plot(kind="bar", ax=ax)
ax.set_title("Distribution des courses par jour de la semaine")
ax.set_xlabel("Jour")
ax.set_ylabel("Nombre de courses")
plt.setp(ax.get_xticklabels(), rotation=30)
ax.grid(axis="y", alpha=0.3)
plt.show()


In [0]:
# Number of trips per ISO week number, shown as a line chart.
# NOTE(review): `week` is the ISO week number only, so identical week numbers
# from different years are pooled together — confirm this is intended.
week_dist = df_sample.groupby("week").size()

plt.figure(figsize=(10,4))
week_dist.plot()
plt.title("Distribution des courses par semaine")
# Axis label repaired: the accented character was mis-encoded (mojibake).
plt.xlabel("Semaine de l'année")
plt.ylabel("Nombre de courses")
plt.grid(alpha=0.3)
plt.show()


In [0]:
# Mean fare per pickup zone, highest first.
prix_par_zone_depart = (
    df_sample.groupby("PULocationID")["fare_amount"]
    .mean()
    .sort_values(ascending=False)
)
# Keep the 10 zones with the highest mean fare.
top_zones = prix_par_zone_depart.head(10)

plt.figure()
# Zone ids are cast to str so they are plotted as categories, not numeric positions.
plt.bar(top_zones.index.astype(str), top_zones.values)
plt.xlabel("Pickup LocationID")
plt.ylabel("Prix moyen")
# Title repaired: the accented characters were mis-encoded (mojibake).
plt.title("Top 10 zones (pickup) avec les fares les plus élevés")
plt.show()

In [0]:
# Mean fare per dropoff zone, highest first.
# NOTE(review): this reuses the names `prix_par_zone_depart` / `top_zones` from
# the pickup cell although the grouping key is DOLocationID; the names are kept
# so any code reading them keeps working, but a dedicated name would be clearer.
prix_par_zone_depart = (
    df_sample.groupby("DOLocationID")["fare_amount"]
    .mean()
    .sort_values(ascending=False)
)
# Keep the 10 zones with the highest mean fare.
top_zones = prix_par_zone_depart.head(10)

plt.figure()
# Zone ids are cast to str so they are plotted as categories, not numeric positions.
plt.bar(top_zones.index.astype(str), top_zones.values)
plt.xlabel("dropoff DOLocationID")
plt.ylabel("Prix moyen")
# Title repaired: the accented characters were mis-encoded (mojibake).
plt.title("Top 10 zones (dropoff) avec les fares les plus élevés")
plt.show()

In [0]:
# Geographic distribution: share of trips for the 10 most frequent pickup zones.
zone_dist = df_sample["PULocationID"].value_counts(normalize=True).head(10)

plt.figure(figsize=(10,4))
zone_dist.plot(kind="bar")
# Title repaired: the accent and the dash were mis-encoded (mojibake).
plt.title("Top 10 des zones de départ (PULocationID) – Sample")
plt.xlabel("PULocationID")
plt.ylabel("Proportion des courses")
plt.xticks(rotation=45)
plt.grid(axis="y", alpha=0.3)
plt.show()

In [0]:
# Outliers: trips with a fare above 200, and a 95% interval for their mean fare.
is_expensive = df_sample["fare_amount"] > 200
df_expensive = df_sample[is_expensive]
data = df_expensive["fare_amount"].dropna()

n = len(data)
mean, std = data.mean(), data.std()

z = 1.96  # 95% confidence level
margin = z * (std / np.sqrt(n))

ic_low, ic_high = mean - margin, mean + margin

mean, ic_low, ic_high


In [0]:
# Long trips: distance above 30, and a 95% interval for their mean distance.
is_long = df_sample["trip_distance"] > 30
df_long = df_sample[is_long]

data = df_long["trip_distance"].dropna()

n = len(data)
mean, std = data.mean(), data.std()

z = 1.96
margin = z * (std / np.sqrt(n))

ic_low, ic_high = mean - margin, mean + margin

mean, ic_low, ic_high

In [0]:
# Tip / Fare ratio.
# The ratio is only defined for strictly positive fares: a zero fare produces
# +/-inf (which dropna() does not remove and which turns the mean into inf/NaN),
# and a negative fare produces a meaningless negative ratio. Rows without a
# positive fare are therefore set to NaN, matching the `fare_amount > 0`
# filter used for the per-payment-type ratio.
positive_fare = df_sample["fare_amount"] > 0

df_sample["tip_ratio"] = (
    df_sample["tip_amount"] / df_sample["fare_amount"]
).where(positive_fare)

# Series.mean() skips NaN by default, so no explicit dropna() is needed.
tip_ratio_mean = df_sample["tip_ratio"].mean()
tip_ratio_mean


In [0]:
# Mean tip/fare ratio per payment type, restricted to payment types 1 and 2
# and to trips with a strictly positive fare.
is_card_or_cash = df_sample["payment_type"].isin([1, 2])
has_positive_fare = df_sample["fare_amount"] > 0

# Independent copy so that adding a column does not touch df_sample.
df_pay = df_sample[is_card_or_cash & has_positive_fare].copy()
df_pay["tip_ratio"] = df_pay["tip_amount"].div(df_pay["fare_amount"])

tip_ratio_by_payment = df_pay.groupby("payment_type")["tip_ratio"].mean()

# Display with readable labels; the stored Series keeps its numeric index.
payment_labels = {1: "Credit Card", 2: "Cash"}
tip_ratio_by_payment.rename(payment_labels)




In [0]:

# Welch's t-test (unequal variances) comparing the tip ratio of
# payment type 1 against payment type 2.
is_card = df_pay["payment_type"] == 1
is_cash = df_pay["payment_type"] == 2

card_tips = df_pay.loc[is_card, "tip_ratio"].dropna()
cash_tips = df_pay.loc[is_cash, "tip_ratio"].dropna()

t_stat, p_value = ttest_ind(card_tips, cash_tips, equal_var=False)

t_stat, p_value


### Charger le dataset POPULATION

In [0]:
# Directory holding the population data as Parquet.
pop_path = "/Volumes/workspace/default/filestore/Data_as_paquets/"

# Read the whole directory as one Spark DataFrame.
# `spark` is the session object provided by the notebook runtime.
df_pop = spark.read.parquet(pop_path)

# Print the schema, then count the rows (count() triggers a full scan).
df_pop.printSchema()
df_pop.count()



### **📌 Description des colonnes – NYC Yellow Taxi**
### 
**VendorID**
Identifiant du fournisseur de taxi **(ex : entreprise ou système de dispatch)**.

**tpep_pickup_datetime**
Date et heure de début de la course **(prise en charge du client)**.

**tpep_dropoff_datetime**
Date et heure de fin de la course.

**passenger_count**
Nombre de passagers dans le taxi.

**trip_distance**
Distance de la course en miles.

**RatecodeID**
Code tarifaire appliqué à la course **(tarif standard, aéroport, hors ville, etc.)**.

**store_and_fwd_flag**
Indique si les données de la course ont été stockées temporairement avant transmission
**(Y = oui, N = non)**.

**PULocationID**
Identifiant de la zone de prise en charge **(Pickup Location)**.

**DOLocationID**
Identifiant de la zone de dépose **(Dropoff Location)**.

**payment_type**
Méthode de paiement utilisée :

1 → Carte

2 → Cash

Autres → non standard

**fare_amount**
Montant de base de la course **(hors taxes et suppléments)**.

**extra**
Suppléments **(heures de nuit, heures de pointe, etc.)**.

**mta_tax**
Taxe MTA **(taxe fixe de transport à NYC)**.

**tip_amount**
Montant du pourboire.

**tolls_amount**
Frais de péage.

**improvement_surcharge**
Supplément réglementaire pour amélioration du service.

**congestion_surcharge**
Supplément lié à la congestion dans certaines zones.

**airport_fee**
Frais supplémentaires pour les trajets vers/depuis les aéroports.

**total_amount**
Montant total payé par le client **(tous frais inclus)**.

In [0]:
# Rebuild the population DataFrame entry by entry: each entry listed under
# pop_path is read on its own, then all parts are unioned by column name so
# that parts lacking some columns are still accepted (missing columns -> null).
files = dbutils.fs.ls(pop_path)

df_list = [spark.read.parquet(entry.path) for entry in files]

# Fold the parts into a single DataFrame, starting from the first one.
df_pop = df_list[0]
for part in df_list[1:]:
    df_pop = df_pop.unionByName(part, allowMissingColumns=True)

df_pop.count()


In [0]:
# Namespaced import so this cell does not depend on the bare names brought in
# by `from pyspark.sql.functions import *`: an earlier cell rebinds `col` to a
# plain string through a `for col in ...` loop, which makes a bare `col(c)`
# call fail with "'str' object is not callable".
from pyspark.sql import functions as F

# One-row DataFrame holding, for every column, the number of NULL values.
# when(...) without otherwise() yields NULL for non-matching rows, and
# count() ignores NULLs, so only the rows where the column IS NULL are counted.
null_summary = df_pop.select([
    F.count(F.when(F.col(c).isNull(), c)).alias(c)
    for c in df_pop.columns
])

# Collect the single result row as a {column: null_count} dictionary.
null_counts = null_summary.first().asDict()

# Keep only the columns that contain at least one NULL.
cols_with_nulls = [c for c, v in null_counts.items() if v > 0]

### les colonnes avec des NULL

In [0]:
# Show the NULL counts for the columns that have at least one NULL.
# null_summary is lazy, so this show() re-evaluates the aggregation over df_pop.
null_summary.select(cols_with_nulls).show(truncate=False)
