In [26]:
import pandas as pd

# Load the merged dataset. read_csv reads the entire file into memory, not a sample.
df = pd.read_csv('merged_dataset.csv')

# Collapse the space in 'Staten Island' so the borough is a single token
# (presumably so item names contain no whitespace once exported for ARM).
# regex=False makes this a literal substring replacement on every pandas version.
df['pickup_borough'] = df['pickup_borough'].str.replace('Staten Island', 'StatenIsland', regex=False)
df['dropoff_borough'] = df['dropoff_borough'].str.replace('Staten Island', 'StatenIsland', regex=False)

# Preview the first rows. This must be the last expression of the cell to be
# rendered; placed here it also reflects the normalized borough names.
df.head()

In [27]:
import pandas as pd



# Discretize trip_distance into short (0, 3], medium (3, 10], long (10, inf).
# pd.cut uses right-closed bins by default, so a distance of exactly 0 (or a
# negative one) falls outside every bin and becomes NaN; such a row simply
# contributes no distance item to its transaction.
df['trip_distance_category'] = pd.cut(
    df['trip_distance'],
    bins=[0, 3, 10, float('inf')],
    labels=['short', 'medium', 'long'],
)

# Map payment_type codes to readable labels; unmapped or missing codes become 'Other'.
df['payment_type_category'] = df['payment_type'].map({
    1: 'Credit_card',
    2: 'Cash',
    3: 'No_charge',
    4: 'Dispute'
}).fillna('Other')

# Convert 'date' to datetime and derive the day type.
# dayofweek is 0 (Monday) .. 6 (Sunday), so values below 5 are weekdays.
# Rows whose date is missing (NaT) get NaN rather than being counted as
# 'weekend', which is what a plain `x < 5` test on NaN would yield.
df['date'] = pd.to_datetime(df['date'])
day_of_week = df['date'].dt.dayofweek
df['day_type'] = (
    day_of_week.lt(5)
    .map({True: 'weekday', False: 'weekend'})
    .where(day_of_week.notna())
)

# Discretize PRCP and SNOW. The first edge is slightly below zero so that a
# value of exactly 0 lands in the 'no_rain' / 'no_snow' bin.
df['PRCP_category'] = pd.cut(
    df['PRCP'],
    bins=[-0.01, 0.01, 0.1, 0.5, float('inf')],
    labels=['no_rain', 'light_rain', 'moderate_rain', 'heavy_rain'],
)
df['SNOW_category'] = pd.cut(
    df['SNOW'],
    bins=[-0.01, 0.01, 1, 3, float('inf')],
    labels=['no_snow', 'light_snow', 'moderate_snow', 'heavy_snow'],
)

# Categorize Tavg (average temperature) into cold (<= 32), mild (32, 60], hot (> 60).
df['Tavg_category'] = pd.cut(
    df['Tavg'],
    bins=[-float('inf'), 32, 60, float('inf')],
    labels=['cold', 'mild', 'hot'],
)

# Prefix pickup and dropoff boroughs so the two roles stay distinct as items.
# Any prefix left by an earlier run of this cell is stripped first, so
# re-running the cell does not produce 'pickup_pickup_...'.
df['pickup_borough'] = 'pickup_' + df['pickup_borough'].str.replace(r'^(?:pickup_)+', '', regex=True)
df['dropoff_borough'] = 'dropoff_' + df['dropoff_borough'].str.replace(r'^(?:dropoff_)+', '', regex=True)

# Prepare the dataset for ARM by selecting the relevant columns
arm_data = df[['trip_distance_category', 'payment_type_category',
               'day_type', 'pickup_borough', 'dropoff_borough']]

# Convert the dataframe into a list of transactions (lists of items), leaving
# out missing values. Iterating plain tuples avoids building a Series per row,
# which is what DataFrame.apply(axis=1) does.
transactions = [
    [item for item in row if pd.notna(item)]
    for row in arm_data.itertuples(index=False, name=None)
]

# Example: Display the first few transactions to verify the preparation
print(transactions[:5])

[['medium', 'Credit_card', 'weekday', 'pickup_Manhattan', 'dropoff_Manhattan'], ['short', 'Credit_card', 'weekend', 'pickup_Manhattan', 'dropoff_Manhattan'], ['short', 'Credit_card', 'weekday', 'pickup_Manhattan', 'dropoff_Manhattan'], ['long', 'Credit_card', 'weekday', 'pickup_Queens', 'dropoff_Manhattan'], ['medium', 'Credit_card', 'weekday', 'pickup_Queens', 'dropoff_Manhattan']]


In [24]:
# Show which columns currently feed the transaction list.
# NOTE(review): the saved output for this cell carries execution count 24 and
# lists eight columns (including the PRCP/SNOW/Tavg categories), whereas the
# preparation cell now selects five; re-run this cell to refresh the output.
arm_data.columns

Index(['trip_distance_category', 'payment_type_category', 'day_type',
       'PRCP_category', 'SNOW_category', 'pickup_borough', 'dropoff_borough',
       'Tavg_category'],
      dtype='object')

In [28]:
# The longest transaction determines the number of columns in the export.
# default=0 keeps an empty transaction list from raising ValueError.
max_length = max((len(transaction) for transaction in transactions), default=0)

# Build a DataFrame with one row per transaction, right-padding shorter
# transactions with empty strings so every row has max_length fields
transactions_df = pd.DataFrame([transaction + [''] * (max_length - len(transaction)) for transaction in transactions])

# Save the DataFrame to a CSV with neither the index column nor a header row
transactions_df.to_csv('taxi_ARM.csv', index=False, header=False)

