# Cleaning Data with Pandas

## Import Libraries

In [46]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Load the Dataset

In [47]:
df = pd.read_csv("../datasets/student_mental_health.csv")

## Overview of the Dataset

In [48]:
df.head()

Unnamed: 0,Timestamp,Choose your gender,Age,What is your course?,Your current year of Study,What is your CGPA?,Marital status,Do you have Depression?,Do you have Anxiety?,Do you have Panic attack?,Did you seek any specialist for a treatment?
0,8/7/2020 12:02,Female,18.0,Engineering,year 1,3.00 - 3.49,No,Yes,No,Yes,No
1,8/7/2020 12:04,Male,21.0,Islamic education,year 2,3.00 - 3.49,No,No,Yes,No,No
2,8/7/2020 12:05,Male,19.0,BIT,Year 1,3.00 - 3.49,No,Yes,Yes,Yes,No
3,8/7/2020 12:06,Female,22.0,Laws,year 3,3.00 - 3.49,Yes,Yes,No,No,No
4,8/7/2020 12:13,Male,23.0,Mathemathics,year 4,3.00 - 3.49,No,No,No,No,No


## Rename Columns

In [49]:
# Rename all columns to lowercase
df.columns = df.columns.str.lower()
# Replace column: "Choose your gender" with "gender"
df.rename(columns={"choose your gender": "gender"}, inplace=True)
# Replace column: "What is your course?" with "course"
df.rename(columns={"what is your course?": "course"}, inplace=True)
# Rename column: "your current year of study" with "seniority"
df.rename(columns={"your current year of study": "seniority"}, inplace=True)
# Replace multiple columns at the same time
df.rename(columns={
    "what is your cgpa?": "gpa",
    "marital status": "married",
    "do you have depression?": "depressed",
    "do you have anxiety?": "anxiety",
    "do you have panic attack?": "panic_attacks",
    "did you seek any specialist for a treatment?": "treatment"
}, inplace=True)

## View Updated Columns

In [50]:
df.head()

Unnamed: 0,timestamp,gender,age,course,seniority,gpa,married,depressed,anxiety,panic_attacks,treatment
0,8/7/2020 12:02,Female,18.0,Engineering,year 1,3.00 - 3.49,No,Yes,No,Yes,No
1,8/7/2020 12:04,Male,21.0,Islamic education,year 2,3.00 - 3.49,No,No,Yes,No,No
2,8/7/2020 12:05,Male,19.0,BIT,Year 1,3.00 - 3.49,No,Yes,Yes,Yes,No
3,8/7/2020 12:06,Female,22.0,Laws,year 3,3.00 - 3.49,Yes,Yes,No,No,No
4,8/7/2020 12:13,Male,23.0,Mathemathics,year 4,3.00 - 3.49,No,No,No,No,No


## Refactor Column Values

### Seniority String to Integer

In [51]:
# Change seniority to integer and remove extra words
df["seniority"] = df["seniority"].str.lower()
df["seniority"] = df["seniority"].str.replace("year ", "").astype(int)
df.head()

Unnamed: 0,timestamp,gender,age,course,seniority,gpa,married,depressed,anxiety,panic_attacks,treatment
0,8/7/2020 12:02,Female,18.0,Engineering,1,3.00 - 3.49,No,Yes,No,Yes,No
1,8/7/2020 12:04,Male,21.0,Islamic education,2,3.00 - 3.49,No,No,Yes,No,No
2,8/7/2020 12:05,Male,19.0,BIT,1,3.00 - 3.49,No,Yes,Yes,Yes,No
3,8/7/2020 12:06,Female,22.0,Laws,3,3.00 - 3.49,Yes,Yes,No,No,No
4,8/7/2020 12:13,Male,23.0,Mathemathics,4,3.00 - 3.49,No,No,No,No,No


### Age Column to Integer

In [52]:
# Drop NaNs
df = df.dropna(subset=["age"])
df["age"] = df["age"].astype(int)
df.head()

Unnamed: 0,timestamp,gender,age,course,seniority,gpa,married,depressed,anxiety,panic_attacks,treatment
0,8/7/2020 12:02,Female,18,Engineering,1,3.00 - 3.49,No,Yes,No,Yes,No
1,8/7/2020 12:04,Male,21,Islamic education,2,3.00 - 3.49,No,No,Yes,No,No
2,8/7/2020 12:05,Male,19,BIT,1,3.00 - 3.49,No,Yes,Yes,Yes,No
3,8/7/2020 12:06,Female,22,Laws,3,3.00 - 3.49,Yes,Yes,No,No,No
4,8/7/2020 12:13,Male,23,Mathemathics,4,3.00 - 3.49,No,No,No,No,No


### Yes and Nos to True and False

In [53]:
columns_to_convert = [
    "married",
    "depressed",
    "anxiety",
    "panic_attacks",
    "treatment"
]
# Convert Yes and No to True and False
df[columns_to_convert] = df[columns_to_convert].replace({
    "Yes": True,
    "No": False
})
# Set Data Types
df[columns_to_convert] = df[columns_to_convert].astype(bool)
df.head()

  df[columns_to_convert] = df[columns_to_convert].replace({


Unnamed: 0,timestamp,gender,age,course,seniority,gpa,married,depressed,anxiety,panic_attacks,treatment
0,8/7/2020 12:02,Female,18,Engineering,1,3.00 - 3.49,False,True,False,True,False
1,8/7/2020 12:04,Male,21,Islamic education,2,3.00 - 3.49,False,False,True,False,False
2,8/7/2020 12:05,Male,19,BIT,1,3.00 - 3.49,False,True,True,True,False
3,8/7/2020 12:06,Female,22,Laws,3,3.00 - 3.49,True,True,False,False,False
4,8/7/2020 12:13,Male,23,Mathemathics,4,3.00 - 3.49,False,False,False,False,False


## Custom Date to Datetime

In [54]:
# Convert 'timestamp' column to a valid datetime format
df['timestamp'] = pd.to_datetime(df['timestamp'], format='mixed')
df.head()

Unnamed: 0,timestamp,gender,age,course,seniority,gpa,married,depressed,anxiety,panic_attacks,treatment
0,2020-08-07 12:02:00,Female,18,Engineering,1,3.00 - 3.49,False,True,False,True,False
1,2020-08-07 12:04:00,Male,21,Islamic education,2,3.00 - 3.49,False,False,True,False,False
2,2020-08-07 12:05:00,Male,19,BIT,1,3.00 - 3.49,False,True,True,True,False
3,2020-08-07 12:06:00,Female,22,Laws,3,3.00 - 3.49,True,True,False,False,False
4,2020-08-07 12:13:00,Male,23,Mathemathics,4,3.00 - 3.49,False,False,False,False,False


# Use the Clean Data Easily

In [55]:
# Save the cleaned data to a new CSV file
df.to_csv("../datasets/updated/student_mental_health.csv", index=False)