In [1]:
import pandas as pd
import numpy as np


In [3]:
# Load the dirty dataset
file_path = "indian_election_dataset.csv"
df = pd.read_csv(file_path)

In [4]:
# 1. Handling NULL Values
print("Missing values before:")
print(df.isnull().sum())
df.fillna({'pc_type': 'Unknown', 'cand_sex': 'Unknown'}, inplace=True)
df.dropna(inplace=True)  # Dropping remaining rows with NULLs
print("Missing values after:")
print(df.isnull().sum())

Missing values before:
st_name        3587
year           3686
pc_no          3590
pc_name        3538
pc_type       11366
cand_name      3613
cand_sex       4194
partyname      3576
partyabbre     3563
totvotpoll     3573
electors       3744
dtype: int64
Missing values after:
st_name       0
year          0
pc_no         0
pc_name       0
pc_type       0
cand_name     0
cand_sex      0
partyname     0
partyabbre    0
totvotpoll    0
electors      0
dtype: int64


In [5]:
# 2. Removing Duplicates
print("Duplicates before:", df.duplicated().sum())
df.drop_duplicates(inplace=True)
print("Duplicates after:", df.duplicated().sum())


Duplicates before: 70
Duplicates after: 0


In [6]:
# 3. Standardizing Data Formats
df['partyname'] = df['partyname'].str.replace("_", " ").str.title()
df['pc_name'] = df['pc_name'].str.title()
df['st_name'] = df['st_name'].str.title()

In [7]:
# 4. Correcting Inconsistent Data
df['cand_sex'] = df['cand_sex'].replace({
    'M': 'Male', 'MALE': 'Male', 'F': 'Female', 'FEMALE': 'Female'
})

In [8]:
# 5. Data Type Conversion
df['totvotpoll'] = pd.to_numeric(df['totvotpoll'], errors='coerce')
df['electors'] = pd.to_numeric(df['electors'], errors='coerce')

In [9]:
# Final Check
print("Final Data Types:")
print(df.dtypes)
print("Cleaned Dataset Preview:")
print(df.head())


Final Data Types:
st_name        object
year          float64
pc_no         float64
pc_name        object
pc_type        object
cand_name      object
cand_sex       object
partyname      object
partyabbre     object
totvotpoll    float64
electors      float64
dtype: object
Cleaned Dataset Preview:
                      st_name    year  pc_no                    pc_name  \
1   Andaman & Nicobar Islands  1977.0    1.0  Andaman & Nicobar Islands   
4   Andaman & Nicobar Islands  1980.0    1.0  Andaman & Nicobar Islands   
6   Andaman & Nicobar Islands  1980.0    1.0  Andaman & Nicobar Islands   
7   Andaman & Nicobar Islands  1980.0    1.0  Andaman & Nicobar Islands   
10  Andaman & Nicobar Islands  1980.0    1.0  Andaman & Nicobar Islands   

   pc_type           cand_name cand_sex                           partyname  \
1      GEN   Manoranjan Bhakta     Male            Indian National Congress   
4      GEN         Kannu Chemy     Male                        Independents   
6      GEN  R

In [10]:
# Save the cleaned dataset
cleaned_file_path = "indian_election_cleaned.csv"
df.to_csv(cleaned_file_path, index=False)