### Task 1: Data Pipeline Development

Goal: Automate the ETL (Extract, Transform, Load) process using Pandas and Scikit-learn.

Steps:

Load data from a CSV, database, or API.

Clean and preprocess it (handle missing values, outliers, normalization).

Apply transformations like feature scaling or encoding.

Store the cleaned data in a database or a new file.

In [2]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


In [3]:
df = pd.read_csv('Indian_Traffic_Violations.csv')
df.head(5)

Unnamed: 0,Violation_ID,Violation_Type,Fine_Amount,Location,Date,Time,Vehicle_Type,Vehicle_Color,Vehicle_Model_Year,Registration_State,...,Speed_Limit,Recorded_Speed,Alcohol_Level,Breathalyzer_Result,Towed,Fine_Paid,Payment_Method,Court_Appearance_Required,Previous_Violations,Comments
0,VLT100000,Overloading,4544,Karnataka,2023-01-01,23:02,Car,Red,2012,West Bengal,...,100,95,0.03,Negative,Yes,No,Online,Yes,3,Repeat Offender
1,VLT100001,Driving Without License,2776,Punjab,2023-01-02,00:42,Scooter,Silver,2010,Tamil Nadu,...,40,48,0.45,Negative,Yes,Yes,Online,No,2,Repeat Offender
2,VLT100002,Using Mobile Phone,4785,Maharashtra,2023-01-03,04:32,Scooter,Grey,2006,Tamil Nadu,...,80,26,0.31,Not Conducted,No,No,Not Paid,Yes,4,
3,VLT100003,No Seatbelt,1138,Uttar Pradesh,2023-01-04,15:06,Car,Green,1996,Uttar Pradesh,...,100,115,0.09,Not Conducted,No,Yes,Online,No,5,Repeat Offender
4,VLT100004,Over-speeding,1610,Karnataka,2023-01-05,06:57,Truck,Yellow,2016,Delhi,...,30,115,0.28,Positive,No,Yes,Cash,Yes,0,


In [4]:
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns
categorical_features = df.select_dtypes(include=['object']).columns

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

df_processed = preprocessor.fit_transform(df)

processed_columns = (
    list(numeric_features) + 
    list(preprocessor.named_transformers_['cat'].named_steps['encoder'].get_feature_names_out(categorical_features))
)

df_processed = pd.DataFrame(df_processed, columns=processed_columns)

df_processed.to_csv("processed_data.csv", index=False)

print("Data Preprocessing Completed and Saved Correctly!")


Data Preprocessing Completed and Saved Correctly!


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [6]:
df = pd.read_csv("processed_data.csv")
df.head(5)

Unnamed: 0,Fine_Amount,Vehicle_Model_Year,Driver_Age,Penalty_Points,Number_of_Passengers,Speed_Limit,Recorded_Speed,Alcohol_Level,Previous_Violations,Violation_ID_VLT100000,...,Fine_Paid_Yes,Payment_Method_Card,Payment_Method_Cash,Payment_Method_Not Paid,Payment_Method_Online,Court_Appearance_Required_No,Court_Appearance_Required_Yes,Comments_Fine Paid On Spot,Comments_First Violation,Comments_Repeat Offender
0,1.4212,0.370792,-1.284293,0.012512,0.708527,1.700464,0.868328,-1.540486,0.304884,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1,0.173703,0.130758,-0.869202,-0.30424,0.708527,-0.838647,-0.730803,1.381746,-0.278626,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
2,1.591249,-0.349309,1.206256,0.962767,1.417408,0.854094,-1.479332,0.407669,0.888394,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
3,-0.982066,-1.549477,-0.039019,-0.620992,-0.709236,1.700464,1.548809,-1.123024,1.471905,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
4,-0.649024,0.850859,0.969061,-0.30424,0.708527,-1.261833,1.548809,0.198938,-1.445647,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [7]:
X = df.drop(columns=["Fine_Amount"]) 
y = df["Fine_Amount"]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
model = LinearRegression()
model.fit(X_train, y_train)

In [10]:
y_pred = model.predict(X_test)


In [11]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [12]:
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R²) Score: {r2}")

Mean Squared Error (MSE): 1.1614903764711961
R-squared (R²) Score: -0.13740601392611573


In [13]:
df.isnull().sum() 
df.describe()

Unnamed: 0,Fine_Amount,Vehicle_Model_Year,Driver_Age,Penalty_Points,Number_of_Passengers,Speed_Limit,Recorded_Speed,Alcohol_Level,Previous_Violations,Violation_ID_VLT100000,...,Fine_Paid_Yes,Payment_Method_Card,Payment_Method_Cash,Payment_Method_Not Paid,Payment_Method_Online,Court_Appearance_Required_No,Court_Appearance_Required_Yes,Comments_Fine Paid On Spot,Comments_First Violation,Comments_Repeat Offender
count,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,...,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0
mean,3.0198070000000005e-17,5.286438e-15,-8.171241000000001e-17,7.993606e-17,-5.595524000000001e-17,-1.232348e-16,2.1316280000000002e-17,-5.3734790000000006e-17,3.907985e-17,0.00025,...,0.493,0.2545,0.25825,0.2475,0.23975,0.49925,0.50075,0.5055,0.25175,0.24275
std,1.000125,1.000125,1.000125,1.000125,1.000125,1.000125,1.000125,1.000125,1.000125,0.015811,...,0.500014,0.435634,0.437727,0.431614,0.426984,0.500062,0.500062,0.500032,0.434073,0.428799
min,-1.714476,-1.669494,-1.699385,-1.571247,-1.418117,-1.261833,-1.683476,-1.749217,-1.445647,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.8813416,-0.8293762,-0.8692017,-0.9377437,-0.7092357,-0.8386474,-0.9009227,-0.8447164,-0.8621364,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,-0.01680822,0.0107415,0.02028019,0.0125117,-0.0003544406,0.007723131,0.01772653,-0.009792956,-0.2786261,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
75%,0.84896,0.8508592,0.8504633,0.9627671,0.7085268,0.8540936,0.8683277,0.8947074,0.8883944,0.0,...,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
max,1.742952,1.690977,1.680646,1.596271,1.417408,1.700464,1.718929,1.729631,1.471905,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
