# Student Performance - Exploratory Data Analysis

This notebook contains the exploratory data analysis for the student performance prediction project.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data
# The path is relative to the notebook's working directory.
data_path = '../../artifacts/data_ingestion/data.csv'
df = pd.read_csv(data_path)

# Display basic information: (rows, columns) followed by the column labels
print("Dataset Shape:", df.shape)
print("\nColumn Names:", df.columns.to_list(), sep="\n")

# Display first few rows (last expression, so the notebook renders it as a table)
df.head()

In [None]:
# Statistical summary
# describe() with default arguments; the result is bound to a name and then
# left as the cell's last expression so the notebook renders it.
numeric_summary = df.describe()
numeric_summary

In [None]:
# Check for missing values: per-column count of null entries
missing_per_column = df.isna().sum()
print("Missing Values:", missing_per_column, sep="\n")

# Data types: dtype of every column, preceded by a blank line
print("\nData Types:", df.dtypes, sep="\n")

In [None]:
# Visualizations
# One 2x3 figure: the math-score histogram in the first panel, followed by a
# box plot of math score against each categorical column. Every panel is drawn
# on an explicit Axes object rather than through pyplot's implicit
# "current axes" state, and the five box plots share a single loop.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()  # flatten the 2x3 grid into row-major order

# Distribution of math scores
sns.histplot(df['math_score'], kde=True, ax=axes[0])
axes[0].set_title('Distribution of Math Scores')

# (column on the x axis, panel title, rotate x tick labels by 45 degrees?)
# Listed in the order the panels fill the remaining grid positions.
boxplot_specs = [
    ('gender', 'Math Score by Gender', False),
    ('race_ethnicity', 'Math Score by Race/Ethnicity', True),
    ('parental_level_of_education', 'Math Score by Parental Education', True),
    ('lunch', 'Math Score by Lunch Type', False),
    ('test_preparation_course', 'Math Score by Test Preparation', False),
]

for ax, (column, title, rotate_labels) in zip(axes[1:], boxplot_specs):
    sns.boxplot(x=column, y='math_score', data=df, ax=ax)
    ax.set_title(title)
    if rotate_labels:
        ax.tick_params(axis='x', labelrotation=45)

fig.tight_layout()
plt.show()

In [None]:
# Correlation matrix
# Pairwise correlation between the three score columns, drawn as an annotated
# heatmap with the colormap centered on zero.
score_columns = ['math_score', 'reading_score', 'writing_score']

plt.figure(figsize=(10, 8))
correlation_matrix = df.loc[:, score_columns].corr()
sns.heatmap(data=correlation_matrix, cmap='coolwarm', center=0, annot=True)
plt.title('Correlation Matrix of Scores')
plt.show()