In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:

# Load the Students Performance dataset (downloaded from Kaggle).
# The path is relative, so the CSV is looked up in the kernel's working directory.
df = pd.read_csv("StudentsPerformance.csv")

# Example students used in the explanatory comments of this notebook:
# Aarav, Ananya, Rohan, Priya
# NOTE(review): these names look like illustrative labels for individual rows;
# the frame is later given eight column names and none of them is a name column - confirm.


In [None]:

# Preview the first rows (DataFrame.head shows five by default) to sanity-check the load.
df.head()

# Example rows (illustrative; the leading name is a label for the row, not a column):
# Aarav  | male   | group B | bachelor's degree | standard | none      | 72 | 72 | 74
# Ananya | female | group C | some college      | standard | completed | 69 | 90 | 88
# Rohan  | male   | group B | master's degree   | standard | none      | 90 | 95 | 93


In [None]:

# Preview the last rows (DataFrame.tail shows five by default) to check the end of the file.
df.tail()

# Example row (illustrative; the leading name is a label for the row, not a column):
# Priya | female | group E | high school | free/reduced | completed | 57 | 56 | 54


In [None]:

# Print the column names, non-null counts, dtypes and memory usage of the frame.
df.info()


In [None]:

# Count the missing values in each column (one total per column).
df.isnull().sum()

# Example (illustrative expectation, not captured output):
# Aarav, Ananya, Rohan, Priya have no missing values


In [None]:

# Count rows that are exact repeats of an earlier row (all columns equal).
df.duplicated().sum()

# Example (illustrative expectation, not captured output):
# No duplicate student records


In [None]:

# Remove exact duplicate rows, keeping the first occurrence of each (pandas default).
# The surviving rows keep their original index labels; the index is not reset here.
df = df.drop_duplicates()


In [None]:

# Show the current column labels before they are renamed.
df.columns

# Before renaming, the score columns contain spaces:
# 'math score', 'reading score', 'writing score'


In [None]:

# Give every column a snake_case label.
# The assignment is positional: the i-th name labels the i-th existing column,
# and pandas raises a ValueError unless the frame has exactly eight columns.
df.columns = (
    ['gender', 'race_ethnicity', 'parent_education', 'lunch', 'test_prep']  # non-score columns
    + ['math_score', 'reading_score', 'writing_score']  # exam scores
)

# Example (illustrative):
# Aarav → math_score = 72


In [None]:

# Display the rows whose math score falls outside the 0-100 range;
# an empty result means every math score is within bounds.
df.loc[df['math_score'].lt(0) | df['math_score'].gt(100)]

# Example (illustrative):
# Aarav math = 72 ✔
# Rohan math = 90 ✔


In [None]:

# Display the rows whose reading or writing score falls outside the 0-100 range.
# Each column is tested against both bounds, so a reading score above 100 or a
# writing score below 0 is reported as well; an empty result means all are in range.
df[
    (df['reading_score'] < 0) | (df['reading_score'] > 100) |
    (df['writing_score'] < 0) | (df['writing_score'] > 100)
]


In [None]:

# Count how many rows fall into each distinct value of the gender column.
df['gender'].value_counts()

# Example (illustrative):
# Aarav → male
# Ananya → female


In [None]:

# List the distinct values present in the race_ethnicity column.
df['race_ethnicity'].unique()

# Example values (illustrative, not necessarily the complete list):
# group B, group C, group E


In [None]:

# Count how many rows fall into each distinct value of the test_prep column.
df['test_prep'].value_counts()

# Example (illustrative):
# Ananya → completed
# Aarav → none


In [None]:

# Summary statistics restricted to the three score columns.
# NOTE(review): assumes the score columns are numeric, in which case describe()
# reports count, mean, std, min, quartiles and max - confirm with df.info().
df[['math_score','reading_score','writing_score']].describe()


In [None]:

# Store the per-row mean of the three exam scores in a new column:
# add the scores together, then divide the total by the number of exams (3).
df['average_score'] = (
    df['math_score']
    .add(df['reading_score'])
    .add(df['writing_score'])
    .div(3)
)

# Example (illustrative, rounded to two decimals):
# Aarav  → 72.67
# Ananya → 82.33
# Priya  → 55.67


In [None]:

# Preview the three scores next to the derived average for the first rows.
df[['math_score','reading_score','writing_score','average_score']].head()


In [None]:

def performance(avg, high=75, medium=50):
    """Map an average score to a performance label.

    Parameters
    ----------
    avg : float or None
        Average score to classify.
    high : float, default 75
        Smallest average (inclusive) that is labelled 'High'.
    medium : float, default 50
        Smallest average (inclusive) that is labelled 'Medium'.

    Returns
    -------
    str or None
        'High', 'Medium' or 'Low'. None is returned when ``avg`` is missing
        (None or NaN): NaN fails every ``>=`` comparison, so without this guard
        a missing average would silently be reported as 'Low'.
    """
    if pd.isna(avg):
        return None
    if avg >= high:
        return 'High'
    if avg >= medium:
        return 'Medium'
    return 'Low'

# Label each row by passing its average score through performance(), one value at a time.
df['performance_level'] = df['average_score'].map(performance)

# Example (illustrative):
# Rohan → High
# Aarav → Medium
# Low scorer → Low


In [None]:

# Preview the average score next to its assigned performance label for the first rows.
df[['average_score','performance_level']].head()


In [None]:

# Box plot comparing the distributions of the three exam scores side by side.
# An explicit Axes is created and handed to seaborn instead of relying on the
# implicit "current figure" of the pyplot state machine.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df[['math_score', 'reading_score', 'writing_score']], ax=ax)
# Title and axis labels let the figure be understood without the surrounding code.
ax.set(title='Distribution of exam scores', xlabel='Subject', ylabel='Score')
plt.show()


In [None]:

# Summary statistics for every column; include='all' covers non-numeric columns too.
df.describe(include='all')


In [None]:

# Write the cleaned frame, including the derived average_score and performance_level
# columns, to a new CSV in the working directory; index=False leaves the row index out.
df.to_csv("StudentsPerformance_Cleaned.csv", index=False)

# Output file:
# StudentsPerformance_Cleaned.csv
