# Stage 03: Data cleaning and transformation

In this notebook we clean and transform the data to prepare it for later analysis. This stage is divided into three steps:

1. Importing libraries
2. Loading the data
3. Data cleaning and transformation

### Step 1: importing libraries

In [32]:
import datetime
import numpy as np
import pandas as pd

### Step 2: loading the data


In [33]:
# Read the yellow_tripdata 2018-01 parquet file from its remote URL.
trips_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2018-01.parquet'
df = pd.read_parquet(trips_url)

### Step 3: data cleaning and transformation

#### a) Deleting unnecessary columns

In [34]:
# Remove the two columns that this stage does not keep.
columns_to_drop = ['congestion_surcharge', 'airport_fee']
df = df.drop(columns=columns_to_drop)

#### b) Deleting duplicate values

In [35]:
# Discard rows that are exact duplicates of an earlier row (index labels are left as they are).
df = df.drop_duplicates()

#### c) Column renaming

In [36]:
# Give the identifier columns the `id_*` names used from here on.
trip_column_names = {
    'VendorID': 'id_vendor',
    'RatecodeID': 'id_ratecode',
    'PULocationID': 'id_pu_zone',
    'DOLocationID': 'id_do_zone',
}
df = df.rename(columns=trip_column_names)

#### d) We created a new column, fare_per_mile, to study the relation between fare_amount and trip_distance

In [37]:
# Fare per mile. Zero or missing distances are replaced by 1 in the divisor so
# the distance side of the division never produces inf/NaN.
# The divisor is built as a standalone Series instead of calling
# `.replace(..., inplace=True)` / `.fillna(..., inplace=True)` on a column
# selected from `df`: that chained in-place form is not guaranteed to write
# back to the frame (it does not under pandas Copy-on-Write), and it also
# removes the need for a temporary helper column.
safe_distance = df['trip_distance'].replace(0, 1).fillna(1)
df['fare_per_mile'] = df['fare_amount'] / safe_distance

# Trips with a recorded distance of exactly 0 get a rate of 0 rather than the raw fare.
df.loc[df['trip_distance'] == 0, 'fare_per_mile'] = 0

#### e) We created a new column, trip_time, to identify the trip time in seconds.

First, we calculate the time difference

In [38]:
# Elapsed time between pickup and dropoff, as a timedelta column.
df['trip_time'] = df['tpep_dropoff_datetime'].sub(df['tpep_pickup_datetime'])

Then, we converted it to seconds

In [39]:
# Replace the timedelta values with their length in seconds (float).
df['trip_time'] = df['trip_time'].dt.total_seconds()

#### f) We created a new column, fare_per_minute, to identify the relation between fare_amount and trip_time

In [40]:
# Fare per minute. Zero or missing durations are replaced by 1 second in the
# divisor so the duration side of the division never produces inf/NaN.
# The divisor is built as a standalone Series instead of calling
# `.replace(..., inplace=True)` / `.fillna(..., inplace=True)` on a column
# selected from `df`: that chained in-place form is not guaranteed to write
# back to the frame (it does not under pandas Copy-on-Write), and it also
# removes the need for a temporary helper column.
safe_seconds = df['trip_time'].replace(0, 1).fillna(1)

# trip_time is in seconds, so divide by 60 to express the rate per minute.
df['fare_per_minute'] = df['fare_amount'] / (safe_seconds / 60)

# Trips with a duration of exactly 0 seconds get a rate of 0.
df.loc[df['trip_time'] == 0, 'fare_per_minute'] = 0

#### g) Starting to work with zones dataset and boroughs dataset

First, we load the taxi zones dataset

In [41]:
# Read the taxi zone lookup table (CSV) from its remote URL.
zones_url = 'https://raw.githubusercontent.com/soyHenry/DS-Proyecto_Grupal_TaxisNYC/main/taxi%2B_zone_lookup.csv'
df_zone = pd.read_csv(zones_url)

Columns renaming

In [42]:
# Rename the zone lookup columns to lower-case names.
zone_column_names = {
    'LocationID': 'id_zone',
    'Borough': 'borough',
    'Zone': 'zone',
}
df_zone = df_zone.rename(columns=zone_column_names)

Creating the boroughs dataframe

In [43]:
# Small lookup table of boroughs with consecutive ids starting at 1.
borough_data = {
    'id_borough': list(range(1, 8)),
    'borough': [
        'Brooklyn',
        'Bronx',
        'Manhattan',
        'Staten Island',
        'Queens',
        'EWR',
        'Unknown',
    ],
}
df_borough = pd.DataFrame(data=borough_data)

Creating a dictionary that maps each zone id to its borough, so that we can map it onto the trips later

In [44]:
# zone id -> borough name, paired row by row from the zone table.
dic_zone_borough = dict(zip(df_zone['id_zone'], df_zone['borough']))

Creating a dictionary that maps each borough name to its id, so that we can map it onto the trips later

In [45]:
# borough name -> borough id, paired row by row from the borough table.
dic_id_borough = dict(zip(df_borough['borough'], df_borough['id_borough']))

Creating two new columns: pu_borough ("pick-up borough") and do_borough ("drop-off borough")

In [46]:
# Translate the pickup ("pu") and dropoff ("do") zone ids into borough names.
for side in ('pu', 'do'):
    df[f'{side}_borough'] = df[f'id_{side}_zone'].map(dic_zone_borough)

We create an id_borough column in the taxi dataframe so that we can define the relationship in SQL

In [47]:
# Borough id of the pickup borough, looked up by borough name.
pickup_borough = df['pu_borough']
df['id_borough'] = pickup_borough.map(dic_id_borough)

#### h) We create a new column, id_time_borough, to later define the relationship with the weather table in SQL

In [48]:
# Key made of the pickup hour (YYYYMMDDHH) followed by the borough id.
pickup_hour = df['tpep_pickup_datetime'].dt.strftime('%Y%m%d%H')

# `id_borough` comes from a `.map()` lookup, so a single unmapped borough turns
# the column into float64 and a plain `astype(str)` would render ids as '3.0'.
# Casting through the nullable integer dtype keeps the id formatted as '3';
# rows without a borough id get the suffix '<NA>'.
borough_code = df['id_borough'].astype('Int64').astype(str)

df['id_time_borough'] = pickup_hour + borough_code

#### i) Identifying outliers

Creating outliers column

In [49]:
# Every trip starts with flag 1; the range checks in the next cells overwrite it with 0.
initial_flag = 1
df['outlier'] = initial_flag

Outliers trip_distance

In [50]:
# Interquartile-range bounds for trip_distance: 1.5 * IQR below Q1 and above Q3.
# The bounds are named `lower_bound` / `upper_bound` rather than `min` / `max`,
# which would replace the Python built-ins for every later cell in the kernel.
q1, q3 = df['trip_distance'].quantile([.25, .75])
iqr = q3 - q1
lower_bound = q1 - (1.5 * iqr)
upper_bound = q3 + (1.5 * iqr)

# Set the flag to 0 for rows that fall outside the bounds.
out_of_range = (df['trip_distance'] < lower_bound) | (df['trip_distance'] > upper_bound)
df.loc[out_of_range, 'outlier'] = 0

Outliers fare_amount

In [51]:
# Interquartile-range bounds for fare_amount: 1.5 * IQR below Q1 and above Q3.
# The bounds are named `lower_bound` / `upper_bound` rather than `min` / `max`,
# which would replace the Python built-ins for every later cell in the kernel.
q1, q3 = df['fare_amount'].quantile([.25, .75])
iqr = q3 - q1
lower_bound = q1 - (1.5 * iqr)
upper_bound = q3 + (1.5 * iqr)

# Set the flag to 0 for rows that fall outside the bounds.
out_of_range = (df['fare_amount'] < lower_bound) | (df['fare_amount'] > upper_bound)
df.loc[out_of_range, 'outlier'] = 0

Outliers trip_time

In [52]:
# Interquartile-range bounds for trip_time: 1.5 * IQR below Q1 and above Q3.
# The bounds are named `lower_bound` / `upper_bound` rather than `min` / `max`,
# which would replace the Python built-ins for every later cell in the kernel.
q1, q3 = df['trip_time'].quantile([.25, .75])
iqr = q3 - q1
lower_bound = q1 - (1.5 * iqr)
upper_bound = q3 + (1.5 * iqr)

# Set the flag to 0 for rows that fall outside the bounds.
out_of_range = (df['trip_time'] < lower_bound) | (df['trip_time'] > upper_bound)
df.loc[out_of_range, 'outlier'] = 0

In [53]:
# Trip identifier: the row's index label shifted by one.
df['id_trip'] = df.index.to_numpy() + 1

In [54]:
# Move `id_trip` to the front and keep every other column in its current order.
# The column is selected by name instead of by hard-coded slice positions, so
# the reorder stays correct (and drops nothing) if the number of columns
# changes; re-running the cell is also harmless.
cols = ['id_trip'] + [col for col in df.columns if col != 'id_trip']
df = df[cols]

In [None]:
# Pickup date formatted as YYYYMMDD.
pickup_day = df['tpep_pickup_datetime'].dt.strftime('%Y%m%d')
df['datetime_snow'] = pickup_day

In [59]:
# Write the three tables to CSV files in the working directory, without the index column.
exports = (
    (df, 'taxi.csv'),
    (df_zone, 'zone.csv'),
    (df_borough, 'borough.csv'),
)
for frame, filename in exports:
    frame.to_csv(filename, index=False)