In [None]:
# b-data exploration
# First look at the raw file: null counts, inferred dtypes and overall shape.
import pandas as pd

# The file is semicolon-delimited, hence the explicit separator.
file_path = 'student-mat.csv'
df = pd.read_csv(file_path, sep=';')

# Each entry pairs a printed heading with the summary shown beneath it.
overview = (
    ("Missing values per column:", df.isna().sum()),
    ("\nColumn data types:", df.dtypes),
    ("\nDataset size (rows, columns):", df.shape),
)
for heading, summary in overview:
    print(heading)
    print(summary)


In [None]:
# c-data cleaning
# Demonstrates two ways of handling missing values (drop vs. impute) and then
# removes exact duplicate rows from the imputed frame.
import pandas as pd

df = pd.read_csv('student-mat.csv', sep=';')

# Option 1: Remove rows with missing values.
# Computed before any imputation so it reflects the raw data.
df_no_missing = df.dropna()

# Option 2: Replace missing numeric values with the column median.
# The filled column is assigned back instead of calling
# `df[col].fillna(..., inplace=True)`: that form operates on a temporary
# selection, raises a FutureWarning on pandas 2.x, and leaves `df` untouched
# once Copy-on-Write is active (the default in pandas 3.0).
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].median())

# Replace missing categorical values with the most frequent value (mode).
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    modes = df[col].mode()
    # mode() is empty when the column has no non-null values; there is nothing
    # sensible to fill with, so such a column is left as-is instead of raising.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])

# Check if any missing values remain
missing_after_handling = df.isnull().sum().sum()
print("Missing values after handling:", missing_after_handling)

# Remove duplicate rows (a row is a duplicate only if all columns match)
df_no_duplicates = df.drop_duplicates()

# Count duplicates removed
duplicates_removed = len(df) - len(df_no_duplicates)
print("Duplicates removed:", duplicates_removed)


In [None]:
# d- data analysis question
# Answers four summary questions about the final grade column (G3).
import pandas as pd

# Load the dataset
df = pd.read_csv('student-mat.csv', sep=';')
final_grades = df['G3']

# 1. What is the average score in math (G3)?
avg_g3 = final_grades.mean()
print(f"Average score in math (G3): {avg_g3:.2f}")

# 2. How many students scored above 15 in their final grade (G3)?
# Summing the boolean mask counts the rows where the comparison holds.
above_15 = int((final_grades > 15).sum())
print(f"Students scored above 15 in G3: {above_15}")

# 3. Is there a correlation between study time (studytime) and the final grade (G3)?
# Series.corr defaults to the Pearson coefficient.
correlation = df['studytime'].corr(final_grades)
print(f"Correlation between study time and G3: {correlation:.3f}")

# 4. Which gender has a higher average final grade (G3)?
avg_by_gender = df.groupby('sex')['G3'].agg('mean')
print(f"Average G3 by gender:\n{avg_by_gender}")


In [None]:
# e- data visualisation
# Three figures: distribution of final grades, study time vs. final grade,
# and average final grade per gender.
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv('student-mat.csv', sep=';')

# 1. Histogram of final grades (G3)
# Missing grades are dropped first: NaNs make the histogram range undefined.
grades = df['G3'].dropna()
if grades.empty:
    grade_bins = 15  # nothing to size the bins from; keep the previous default
else:
    # One unit-wide bin per grade value. A fixed bin count (e.g. 15) over a
    # wider integer range puts one grade in some bins and two in others, which
    # shows up as artificial spikes.
    # NOTE(review): assumes G3 holds integer-valued grades - confirm.
    grade_bins = range(int(grades.min()), int(grades.max()) + 2)
fig, ax = plt.subplots(figsize=(8, 5))
# align='left' centres each bar on its grade value instead of between two grades.
ax.hist(grades, bins=grade_bins, align='left', color='skyblue', edgecolor='black')
ax.set_title('Histogram of Final Grades (G3)')
ax.set_xlabel('Final Grade (G3)')
ax.set_ylabel('Number of Students')
ax.grid(axis='y')
plt.show()

# 2. Scatter plot between study time (studytime) and final grade (G3)
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(df['studytime'], df['G3'], alpha=0.5)
ax.set_title('Scatter Plot: Study Time vs Final Grade (G3)')
# Per the UCI Student Performance data dictionary, studytime is an ordinal
# code rather than a number of hours, so the axis spells out the coding.
ax.set_xlabel('Weekly Study Time (1: <2h, 2: 2-5h, 3: 5-10h, 4: >10h)')
# Only the observed codes are meaningful tick positions.
ax.set_xticks(sorted(df['studytime'].dropna().unique()))
ax.set_ylabel('Final Grade (G3)')
ax.grid(True)
plt.show()

# 3. Bar chart comparing the average scores of male and female students
avg_scores_gender = df.groupby('sex')['G3'].mean().reset_index()
fig, ax = plt.subplots(figsize=(8, 5))
# groupby sorts its keys, so the colours are applied in sorted label order.
ax.bar(avg_scores_gender['sex'], avg_scores_gender['G3'], color=['pink', 'lightblue'])
ax.set_title('Average Final Grades by Gender')
ax.set_xlabel('Gender')
ax.set_ylabel('Average Final Grade (G3)')
ax.grid(axis='y')
plt.show()
