# Cassandra

Dataset: `taxi_trip_data.csv`

## Connection

In [33]:
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
import json

# Secure connect bundle (SCB) downloaded for the database; change the file
# name here if your bundle is named differently.
cloud_config = {"secure_connect_bundle": "./secure-connect-taxi-trip.zip"}

# Token JSON downloaded alongside the token; change the file name here if
# yours differs. Only the "clientId" and "secret" entries are read.
with open("./taxi_trip-token.json") as token_file:
    secrets = json.load(token_file)

CLIENT_ID, CLIENT_SECRET = secrets["clientId"], secrets["secret"]

# Authenticate with the token credentials and open a session on the cluster.
auth_provider = PlainTextAuthProvider(CLIENT_ID, CLIENT_SECRET)
cluster = Cluster(cloud=cloud_config, auth_provider=auth_provider)
session = cluster.connect()

# Smoke test: ask the server for its release version and print it.
row = session.execute("select release_version from system.local").one()
print(row[0] if row else "An error occurred.")

4.0.0.6816


## Import and Read

In [34]:
import pandas as pd

# Location of the taxi trip CSV on the local machine.
taxi_csv_path = "/Users/yahiaehab/University/Projects/BigData&NoSQL/Assignment 1/datasets/taxi_trip_data.csv"

# Load only the first 50,000 rows of the file into a DataFrame.
data = pd.read_csv(taxi_csv_path, nrows=50_000)

## Data Preprocessing

### Drop Unnecessary Columns

In [35]:
# Remove the columns this notebook does not keep.
# Reassigning (instead of inplace=True) together with errors="ignore" makes
# the cell safe to re-run: a second execution no longer raises KeyError
# because the columns are already gone.
data = data.drop(
    columns=["store_and_fwd_flag", "rate_code", "total_amount"],
    errors="ignore",
)

### Drop Missing Values

In [36]:
# Columns that must all be present for a row to be kept.
required_columns = [
    "vendor_id",
    "pickup_datetime",
    "dropoff_datetime",
    "passenger_count",
    "trip_distance",
    "payment_type",
    "fare_amount",
    "extra",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "imp_surcharge",
    "pickup_location_id",
    "dropoff_location_id",
]

# Discard, in place, every row with a missing value in any required column.
data.dropna(subset=required_columns, inplace=True)

## Calculations

### Trip Duration

In [37]:
# Parse both timestamp columns so they can be subtracted from each other.
for timestamp_column in ("pickup_datetime", "dropoff_datetime"):
    data[timestamp_column] = pd.to_datetime(data[timestamp_column])

# Trip duration = dropoff minus pickup, converted from seconds to minutes.
elapsed = data["dropoff_datetime"] - data["pickup_datetime"]
data["trip_duration"] = elapsed.dt.total_seconds().div(60)

### Total Trip Cost

In [38]:
# Total trip cost = fare plus every surcharge column, added left to right
# in the order listed below.
running_total = data["fare_amount"]
for charge_column in ("extra", "mta_tax", "tip_amount", "tolls_amount", "imp_surcharge"):
    running_total = running_total + data[charge_column]

data["total_trip_cost"] = running_total

## Cassandra Work

### Initiation

In [39]:
from pprint import pprint

# Keyspace used for every statement issued through this session from here on.
keyspace_name = "taxi_trip_data"
session.set_keyspace(keyspace_name)

# Look up the driver's metadata entry for the keyspace and print it.
row = cluster.metadata.keyspaces[keyspace_name]
pprint(row)

<cassandra.metadata.KeyspaceMetadata object at 0x110263910>


### Table Creation

In [40]:
# CQL definition of the `trips` table: one column per DataFrame column.
# The primary key is (vendor_id, pickup_datetime).
# NOTE(review): rows sharing the same vendor_id and pickup_datetime would map
# to the same primary key -- confirm whether that is acceptable for this data.
create_trips_cql = """
    CREATE TABLE IF NOT EXISTS trips (
        vendor_id text,
        pickup_datetime timestamp,
        dropoff_datetime timestamp,
        passenger_count int,
        trip_distance float,
        payment_type text,
        fare_amount float,
        extra float,
        mta_tax float,
        tip_amount float,
        tolls_amount float,
        imp_surcharge float,
        pickup_location_id text,
        dropoff_location_id text,
        trip_duration float,
        total_trip_cost float,
        PRIMARY KEY (vendor_id, pickup_datetime)
    )
"""

# Create the table only if it is not already present in the current keyspace.
session.execute(create_trips_cql)

<cassandra.cluster.ResultSet at 0x10f67b340>

### Insertion

In [41]:
# Display the column labels of the preprocessed DataFrame.
data.columns

Index(['vendor_id', 'pickup_datetime', 'dropoff_datetime', 'passenger_count',
       'trip_distance', 'payment_type', 'fare_amount', 'extra', 'mta_tax',
       'tip_amount', 'tolls_amount', 'imp_surcharge', 'pickup_location_id',
       'dropoff_location_id', 'trip_duration', 'total_trip_cost'],
      dtype='object')

In [43]:
import csv

# Insert the preprocessed trips into the `trips` table.
#
# Rows are taken from the `data` DataFrame rather than re-read from the raw
# CSV. The raw file still contains the columns dropped during preprocessing
# and does not contain `trip_duration` / `total_trip_cost` (both are computed
# in this notebook), so positional indexing into raw CSV rows does not line up
# with the table's columns.
#
# A prepared statement with bound values replaces string-built CQL, so no
# manual quote escaping is needed and the statement is prepared only once.

row_limit = 20_000  # maximum number of rows to send
row_count = 0       # rows attempted so far
failed_count = 0    # rows whose INSERT raised an exception

# Column order shared by the INSERT statement and the bound value tuples.
trip_columns = [
    "vendor_id",
    "pickup_datetime",
    "dropoff_datetime",
    "passenger_count",
    "trip_distance",
    "payment_type",
    "fare_amount",
    "extra",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "imp_surcharge",
    "pickup_location_id",
    "dropoff_location_id",
    "trip_duration",
    "total_trip_cost",
]


def _as_text(value):
    """Return `value` as a string for a CQL `text` column.

    Whole-number floats (e.g. 161.0, which pandas produces when an integer
    column is read as float) are rendered without the trailing ".0".
    """
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


insert_trip = session.prepare(
    f"INSERT INTO trips ({', '.join(trip_columns)}) "
    f"VALUES ({', '.join('?' for _ in trip_columns)})"
)

for trip in data[trip_columns].head(row_limit).itertuples(index=False):
    # Convert pandas/numpy scalars to the plain Python types matching the
    # table's column types (text, timestamp, int, float).
    values = (
        _as_text(trip.vendor_id),
        trip.pickup_datetime.to_pydatetime(),
        trip.dropoff_datetime.to_pydatetime(),
        int(trip.passenger_count),
        float(trip.trip_distance),
        _as_text(trip.payment_type),
        float(trip.fare_amount),
        float(trip.extra),
        float(trip.mta_tax),
        float(trip.tip_amount),
        float(trip.tolls_amount),
        float(trip.imp_surcharge),
        _as_text(trip.pickup_location_id),
        _as_text(trip.dropoff_location_id),
        float(trip.trip_duration),
        float(trip.total_trip_cost),
    )

    # Best effort, as before: report a failed insert and continue with the
    # next row instead of aborting the whole load.
    try:
        session.execute(insert_trip, values)
    except Exception as e:
        failed_count += 1
        print("Error executing query:", e)

    row_count += 1

print(f"Attempted {row_count} inserts, {failed_count} failed.")

Error executing query: errors={'01ac88f6-b79d-4938-ad9d-5597469bea97-us-east1.db.astra.datastax.com:29042:8e918b35-33a5-411a-b5ee-405168430ad2': 'Client request timeout. See Session.execute[_async](timeout)'}, last_host=01ac88f6-b79d-4938-ad9d-5597469bea97-us-east1.db.astra.datastax.com:29042:8e918b35-33a5-411a-b5ee-405168430ad2
Error executing query: errors={'01ac88f6-b79d-4938-ad9d-5597469bea97-us-east1.db.astra.datastax.com:29042:c39a5bc8-1a0c-4e01-ab3a-883b2afa0063': 'Client request timeout. See Session.execute[_async](timeout)'}, last_host=01ac88f6-b79d-4938-ad9d-5597469bea97-us-east1.db.astra.datastax.com:29042:c39a5bc8-1a0c-4e01-ab3a-883b2afa0063
Error executing query: errors={'01ac88f6-b79d-4938-ad9d-5597469bea97-us-east1.db.astra.datastax.com:29042:8e918b35-33a5-411a-b5ee-405168430ad2': 'Client request timeout. See Session.execute[_async](timeout)'}, last_host=01ac88f6-b79d-4938-ad9d-5597469bea97-us-east1.db.astra.datastax.com:29042:8e918b35-33a5-411a-b5ee-405168430ad2
Error e

### Closing Connection

In [44]:
# Close the connection by shutting down the `cluster` object that was
# created in the connection cell at the top of the notebook.
cluster.shutdown()