In [1]:
import kagglehub
import os
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Download the dataset
path = kagglehub.dataset_download("fahmidachowdhury/domestic-violence-against-women")

# List files in the directory to find the CSV
csv_files = [f for f in os.listdir(path) if f.endswith(".csv")]

if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {path}")

# Construct path to the CSV file
csv_path = os.path.join(path, csv_files[0])

# Load the CSV into a DataFrame
dataframe = pd.read_csv(csv_path)

In [3]:
print(dataframe.head())

   SL. No  Age Education  Employment   Income Marital status  Violence 
0       1   30  secondary  unemployed       0         married       yes
1       2   47   tertiary  unemployed       0         married        no
2       3   24   tertiary  unemployed       0        unmarred        no
3       4   22   tertiary  unemployed       0        unmarred        no
4       5   50    primary  unemployed       0         married       yes


# Checking the reliability of data and data cleaning

<b> Interesting observation(s): </b>

⭐ 75.22% of the dataset contains are of non-violent households, indicating an class-unbalanced.

⭐ Around 86.08% of records contain an income of 0. There are many reasons why this may be the case: 
- Missing income data are default to 0
- There may be biases during the retrieval of data
- Domestic violence are more easily recorded among low-income households.

<b> Change(s) made: </b>

⭐ Remove leading and trailing spaces of String data and column labels

⭐ Added a column 'Age group' that group data in the column 'Age' into intervals of 10.

In [5]:
# Print out column labels for checking

print(dataframe.columns.tolist())

['SL. No', 'Age', 'Education ', 'Employment ', 'Income', 'Marital status ', 'Violence ']


In [6]:
# Noticeably, some of the column names contain trailing white spaces which we will remove.

dataframe.columns = dataframe.columns.str.strip()
print(dataframe.columns.tolist())

['SL. No', 'Age', 'Education', 'Employment', 'Income', 'Marital status', 'Violence']


In [7]:
# Count NA values in each column
dataframe.isna().sum()

SL. No            0
Age               0
Education         0
Employment        0
Income            0
Marital status    0
Violence          0
dtype: int64

In [8]:
# Make SL. No my index column

dataframe = dataframe.set_index('SL. No')

In [9]:
# Ensure consistency of data type of each column

dataframe.dtypes

Age                int64
Education         object
Employment        object
Income             int64
Marital status    object
Violence          object
dtype: object

In [10]:
# Check unique values of columns where the datatype is String
string_cols = dataframe.select_dtypes(include='object').columns

# Print out the unique values of these columns
for col in string_cols:
    print(f"--- {col} ---")
    print(dataframe[col].unique())
    print()

--- Education ---
['secondary' 'tertiary' 'primary' 'none']

--- Employment ---
['unemployed' 'semi employed' 'employed' 'employed ']

--- Marital status ---
['married' 'unmarred']

--- Violence ---
['yes' 'no']



In [11]:
# Noticeably, some of the data contains trailing spaces which we will remove. 

# Remove trailing spaces for String data type
dataframe = dataframe.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)

string_cols = dataframe.select_dtypes(include='object').columns

# Present the new unique values of String columns
for col in string_cols:
    print(f"--- {col} ---")
    print(dataframe[col].unique())
    print()

--- Education ---
['secondary' 'tertiary' 'primary' 'none']

--- Employment ---
['unemployed' 'semi employed' 'employed']

--- Marital status ---
['married' 'unmarred']

--- Violence ---
['yes' 'no']



In [12]:
# Check if the dataset is class-balanced (roughly equal number of violent and non-violent relationship)
# Result shows that there are more non-violent data than violent.

# Calculate the total number of data
total_data = len(dataframe['Violence'])

# Calculate total number of yes and no in the Violence column
count_per_categories = dataframe['Violence'].value_counts()

# Calculate the percentage
category_percentage = (count_per_categories / total_data) * 100

# Present the percentage in a DataFrame
violence_statistics = pd.DataFrame({
    'Count per categories': count_per_categories,
    'Percentage': category_percentage.round(2)  
})

print(violence_statistics)

          Count per categories  Percentage
Violence                                  
no                         261       75.22
yes                         86       24.78


In [13]:
# Obtain the data information of numeric columns including min, max, mode, average
# An interesting observation seen is that the Q1, median and Q3 for the Income column is zero
# This indicates that most households in the dataset is low-income, as shown below.

dataframe.describe().round(1)

Unnamed: 0,Age,Income
count,347.0,347.0
mean,31.4,2110.7
std,9.6,5743.3
min,15.0,0.0
25%,23.0,0.0
50%,30.0,0.0
75%,39.5,0.0
max,60.0,35000.0


In [14]:
# Calculate the percentage of income level that made up the top 10 income level based on frequency.

# Calculate the total number of each income level for the top 10 income level.
count_per_top_10_income = dataframe['Income'].value_counts().head(10)

# Calculate the total number of data points
income_count = count_per_top_10_income.sum()

# Calculate the income percentage
income_percentage = (count_per_top_10_income / income_count) * 100

# Create a DataFrame of the data
income_summary = pd.DataFrame({
    'Count': count_per_top_10_income,
    'Percentage': income_percentage.round(2)  
})

print(income_summary)

        Count  Percentage
Income                   
0         272       86.08
5000        8        2.53
3000        6        1.90
2000        5        1.58
22000       5        1.58
4000        4        1.27
15000       4        1.27
6000        4        1.27
2500        4        1.27
10000       4        1.27


In [15]:
# Because age is continuous, it might be helpful to the model to group entries according to age groups
# Note: it has been previously established that the minimum age is 15 and maximum age is 60.

# Define the bin edges for age groups (ages 15 to 65 in 10-year intervals)
bin_edges = [15, 25, 35, 45, 55, 65]

# Define the labels for each corresponding age group
age_groups = [
    "15 ≤ age < 25",
    "25 ≤ age < 35",
    "35 ≤ age < 45",
    "45 ≤ age < 55",
    "55 ≤ age < 65"
]

# Create a new column 'Age group' by binning the 'Age' column into defined intervals
# 'right=False' means the intervals are left-inclusive, i.e., [15,25) includes 15 but not 25
dataframe['Age group'] = pd.cut(dataframe['Age'], bins=bin_edges, labels=age_groups, right=False)

In [16]:
dataframe.head().style.set_properties()

Unnamed: 0_level_0,Age,Education,Employment,Income,Marital status,Violence,Age group
SL. No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,30,secondary,unemployed,0,married,yes,25 ≤ age < 35
2,47,tertiary,unemployed,0,married,no,45 ≤ age < 55
3,24,tertiary,unemployed,0,unmarred,no,15 ≤ age < 25
4,22,tertiary,unemployed,0,unmarred,no,15 ≤ age < 25
5,50,primary,unemployed,0,married,yes,45 ≤ age < 55


In [18]:
dataframe.to_csv('DV_cleaned_data.csv', index=False)