### Installing Luigi

In [1]:
#pip install luigi

### Importing Libraries

In [2]:
import csv
import os

import luigi
import matplotlib as mp
import matplotlib.pyplot as plt
import mysql.connector as mysql
import numpy as np
import pandas as pd
import seaborn as sns
from sqlalchemy import create_engine

### Define the connection parameters for MySQL and MongoDB

In [3]:
# Connection settings for the MySQL server. Each value can be overridden by an
# environment variable of the same name; the literals are fallbacks that keep
# the notebook working when nothing is exported.
# NOTE(review): a password should not live in the notebook -- rotate it and
# remove the fallback once MYSQL_PASSWORD is supplied by the environment.
MYSQL_HOST = os.environ.get('MYSQL_HOST', 'localhost')
MYSQL_USER = os.environ.get('MYSQL_USER', 'root')
MYSQL_PASSWORD = os.environ.get('MYSQL_PASSWORD', 'sana123')
MYSQL_DB = os.environ.get('MYSQL_DB', 'montgomery')

In [4]:
# MongoDB connection string. Export MONGO_URI to override it; the literal is a
# fallback that keeps the notebook working when the variable is not set.
# NOTE(review): the fallback embeds a username and password -- rotate them and
# remove the literal once MONGO_URI is supplied by the environment.
mongo_uri = os.environ.get(
    "MONGO_URI",
    "mongodb+srv://x22237941:sana123@montgomerycluster.tzxvtsd.mongodb.net/?retryWrites=true&w=majority&appName=montgomerycluster",
)

### Importing API libraries

In [5]:
import os
from sodapy import Socrata

### Define the task to extract data from the Socrata API

In [7]:
class ExtractSocrataDataJSON(luigi.Task):
    """Fetch the incidents dataset from the Socrata API and store it as JSON.

    The records returned by the API are loaded into a DataFrame and written to
    ``incidents.json`` as a list of records. The app token is read from the
    ``SODAPY_APPTOKEN`` environment variable; ``None`` is passed to the client
    when the variable is not set.

    NOTE(review): no ``limit`` is passed to ``client.get`` -- presumably only
    the API's default number of rows is returned; confirm whether the whole
    dataset is expected here.
    """

    def output(self):
        """Return the local target the JSON extract is written to."""
        return luigi.LocalTarget("incidents.json")

    def run(self):
        """Download the records and write them to the output target."""
        socrata_domain = 'data.montgomerycountymd.gov'
        socrata_dataset_identifier_incidents = 'bhju-22kf'
        socrata_token = os.environ.get("SODAPY_APPTOKEN")

        client = Socrata(socrata_domain, socrata_token)
        try:
            results = client.get(socrata_dataset_identifier_incidents)
        finally:
            # Release the underlying HTTP session even when the request fails.
            client.close()

        df = pd.DataFrame.from_dict(results)
        incidents_data = df.to_json(orient='records')
        with self.output().open('w') as f:
            f.write(incidents_data)

In [8]:
class ExtractSocrataDataCSV(luigi.Task):
    """Fetch the incidents dataset from the Socrata API and store it as CSV.

    The records returned by the API are loaded into a DataFrame and written to
    ``incidents.csv`` without the index column. The app token is read from the
    ``SODAPY_APPTOKEN`` environment variable; ``None`` is passed to the client
    when the variable is not set.

    NOTE(review): no ``limit`` is passed to ``client.get`` -- presumably only
    the API's default number of rows is returned; confirm whether the whole
    dataset is expected here.
    """

    def output(self):
        """Return the local target the CSV extract is written to."""
        return luigi.LocalTarget("incidents.csv")  # Output CSV file

    def run(self):
        """Download the records and write them to the output target."""
        socrata_domain = 'data.montgomerycountymd.gov'
        socrata_dataset_identifier_incidents = 'bhju-22kf'
        socrata_token = os.environ.get("SODAPY_APPTOKEN")

        client = Socrata(socrata_domain, socrata_token)
        try:
            results = client.get(socrata_dataset_identifier_incidents)
        finally:
            # Release the underlying HTTP session even when the request fails.
            client.close()

        df = pd.DataFrame.from_dict(results)

        # Write to a temporary path that is renamed onto the target only after
        # the CSV has been written completely. Luigi treats an existing output
        # file as "task done", so a half-written incidents.csv must never be
        # visible under the final name.
        with self.output().temporary_path() as temp_path:
            df.to_csv(temp_path, index=False)  # Save data to CSV file

### Define the task to load data into MySQL

In [11]:
class LoadMySQLData(luigi.Task):
    """Load the extracted incidents CSV into the MySQL ``incidents`` table.

    The task creates the ``montgomery2`` database and the ``incidents`` table
    when they do not exist, removes the latitude/longitude/location columns
    from a table that still has them, and inserts one row per CSV record with
    missing values stored as the string ``"Missing"``.

    Connection settings come from the notebook-level ``MYSQL_HOST``,
    ``MYSQL_USER`` and ``MYSQL_PASSWORD`` constants.

    NOTE(review): the task declares no ``output()``, so Luigi has no way to
    see that it already ran; running it again inserts the same rows again.
    """

    # Columns stored in `incidents`, in INSERT order. Every column is TEXT.
    _INCIDENT_COLUMNS = (
        'report_number',
        'local_case_number',
        'agency_name',
        'acrs_report_type',
        'crash_date_time',
        'hit_run',
        'route_type',
        'mile_point',
        'mile_point_direction',
        'lane_direction',
        'lane_number',
        'lane_type',
        'number_of_lanes',
        'direction',
        'distance',
        'distance_unit',
        'road_grade',
        'non_traffic',
        'road_name',
        'cross_street_type',
        'cross_street_name',
        'off_road_description',
        'municipality',
        'related_non_motorist',
        'at_fault',
        'collision_type',
        'weather',
        'surface_condition',
        'light',
        'traffic_control',
        'driver_substance_abuse',
        'non_motorist_substance_abuse',
        'first_harmful_event',
        'second_harmful_event',
        'fixed_object_struck',
        'junction',
        'intersection_type',
        'intersection_area',
        'road_alignment',
        'road_condition',
        'road_division',
    )

    # Columns of the extract that are deliberately not kept in MySQL.
    _UNWANTED_COLUMNS = ('latitude', 'longitude', 'location')

    def requires(self):
        """Depend on the CSV extract that provides the rows to load."""
        return ExtractSocrataDataCSV()

    def run(self):
        """Create the schema if needed and insert every CSV row.

        Raises:
            ValueError: if the CSV contains none of the expected columns.
        """
        database = 'montgomery2'
        columns = self._INCIDENT_COLUMNS

        # Define the SQL queries
        create_database_query = f"CREATE DATABASE IF NOT EXISTS {database}"
        use_database_query = f"USE {database}"
        column_definitions = ",\n".join(f"{name} TEXT" for name in columns)
        create_table_query = (
            f"CREATE TABLE IF NOT EXISTS incidents (\n{column_definitions}\n)"
        )
        show_table_query = "SHOW TABLES"
        # Column list and placeholders are generated from the same tuple so
        # their counts can never drift apart.
        insert_data_query = "INSERT INTO incidents ({}) VALUES ({})".format(
            ", ".join(columns),
            ", ".join(["%s"] * len(columns)),
        )

        # Prepare the rows before touching the database. Columns are picked by
        # name, so the order of the CSV header and any extra fields (including
        # the unwanted ones) do not matter.
        df = pd.read_csv(self.input().path)
        absent = [name for name in columns if name not in df.columns]
        if len(absent) == len(columns):
            raise ValueError(
                "The extract contains none of the expected incident columns"
            )
        if absent:
            print("Columns absent from the extract, stored as 'Missing':", absent)
        df_filled = df.reindex(columns=list(columns)).fillna("Missing")
        rows = list(df_filled.itertuples(index=False, name=None))

        # Connect to the MySQL server (the database may not exist yet)
        conn = mysql.connect(host=MYSQL_HOST, user=MYSQL_USER, password=MYSQL_PASSWORD)
        try:
            cursor = conn.cursor()
            try:
                # Create database and table if they do not exist
                cursor.execute(create_database_query)
                cursor.execute(use_database_query)
                cursor.execute(create_table_query)

                # Show tables in the database
                cursor.execute(show_table_query)
                for table in cursor.fetchall():
                    print(table[0])

                # Drop unnecessary columns, but only those the table really
                # has, so the step is safe to repeat on every run.
                cursor.execute("DESCRIBE incidents")
                existing = {str(column[0]).lower() for column in cursor.fetchall()}
                stale = [name for name in self._UNWANTED_COLUMNS if name in existing]
                if stale:
                    cursor.execute(
                        "ALTER TABLE incidents "
                        + ", ".join(f"DROP COLUMN {name}" for name in stale)
                    )

                # Show columns in the table
                cursor.execute("DESCRIBE incidents")
                for column in cursor.fetchall():
                    print(column[0], "-", column[1])

                # Insert data into the table
                inserted_records_count = 0
                for row in rows:
                    cursor.execute(insert_data_query, row)
                    inserted_records_count += 1
                conn.commit()
                print("Number of records inserted into the incidents table:", inserted_records_count)

                # Check the count of inserted data
                cursor.execute("SELECT COUNT(*) FROM incidents")
                count = cursor.fetchone()[0]
                print("Number of records in 'incidents' table:", count)
            finally:
                cursor.close()
        finally:
            # Closing without a commit discards a partially inserted batch.
            conn.close()

### Define the transformation task using Pandas

In [12]:
class TransformData(luigi.Task):
    """Clean the ``incident_dupe`` table with pandas and write it out as JSON.

    The task reads the whole table from the ``montgomery2`` database, reports
    duplicate rows and null counts, replaces the ``'Missing'`` placeholder in
    selected columns, drops a fixed list of columns plus any column whose null
    count exceeds the threshold, and writes the result to the output target
    as a list of records.

    Connection settings come from the notebook-level ``MYSQL_HOST``,
    ``MYSQL_USER`` and ``MYSQL_PASSWORD`` constants.
    """

    def requires(self):
        """Depend on the task that extracts the data from MySQL."""
        return ExtractMySQLData()

    def output(self):
        """Return the local target the transformed JSON is written to."""
        return luigi.LocalTarget("transformed_data.json")  # Output file

    def run(self):
        """Read, clean and persist the incidents data."""
        database = 'montgomery2'

        # Define the SQL query to read data
        read_data_query = "SELECT * FROM incident_dupe"

        # Read data from MySQL into a DataFrame; the connection is only needed
        # for the read, so it is closed right away, even on failure.
        conn = mysql.connect(host=MYSQL_HOST, user=MYSQL_USER, password=MYSQL_PASSWORD, database=database)
        try:
            sql_frame = pd.read_sql(read_data_query, conn)
        finally:
            conn.close()

        # Checking for duplicate records
        duplicate_rows = sql_frame.duplicated()
        print("Number of duplicate rows:", duplicate_rows.sum())
        print("--------------------------------------------------")
        # Checking Missing Values
        missing_values = sql_frame.isnull().sum()
        print("Columns with missing values:")
        print(missing_values)
        print("--------------------------------------------------")

        # Handle 'Missing' values
        sql_frame['hit_run'] = sql_frame['hit_run'].replace('Missing', 'unknown')
        sql_frame['direction'] = sql_frame['direction'].replace('Missing', 'Unknown')

        # Numeric columns: turn the placeholder into NaN, then impute the
        # median. The result is assigned back explicitly; an in-place fillna on
        # a selected column does not reliably update the parent frame.
        for numeric_column in ('mile_point', 'distance'):
            numeric_values = sql_frame[numeric_column].replace('Missing', np.nan).astype(float)
            sql_frame[numeric_column] = numeric_values.fillna(numeric_values.median())

        # Impute lane_direction with the most frequent *observed* value. The
        # placeholder itself is excluded, otherwise it could win the vote and
        # the replacement would change nothing.
        lane_direction = sql_frame['lane_direction']
        observed_modes = lane_direction[lane_direction != 'Missing'].mode()
        if not observed_modes.empty:
            sql_frame['lane_direction'] = lane_direction.replace('Missing', observed_modes.iloc[0])

        # Drop specified columns
        columns_to_drop = ['cross_street_type', 'off_road_description', 'municipality', 'first_harmful_event', 'second_harmful_event',
                           'mile_point_direction', 'road_grade', 'non_traffic', 'fixed_object_struck', 'intersection_area', 'road_division']
        sql_frame = sql_frame.drop(columns=columns_to_drop)

        # Filter out columns with too many missing values.
        # NOTE(review): the counts are nulls measured before the clean-up
        # above, not occurrences of the 'Missing' placeholder -- confirm which
        # of the two is intended.
        threshold = 50000
        sparse_columns = [
            column
            for column, count in missing_values.items()
            # Skip columns the fixed list above has already removed.
            if count > threshold and column in sql_frame.columns
        ]
        sql_frame = sql_frame.drop(columns=sparse_columns)

        # Print the remaining columns
        print("Remaining columns after dropping:")
        print(sql_frame.columns.tolist())
        print("--------------------------------------------------")

        # Save the transformed DataFrame to JSON through the Luigi target so
        # the path is defined in one place and the file appears only when it
        # is complete.
        with self.output().open('w') as f:
            f.write(sql_frame.to_json(orient='records'))