In [None]:
#Import needed library
import pandas as pd

In [89]:
# Load the student-performance CSV into `df`, then preview its first 10 rows.
df = pd.read_csv("StudentPerformance.csv")
df.head(n=10)


Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0
5,3,78,No,9,6,61.0
6,7,73,Yes,5,6,63.0
7,8,45,Yes,4,6,42.0
8,5,77,No,8,2,61.0
9,4,89,No,4,0,69.0


In [79]:
# Rename the columns to snake_case (no spaces) so they are easier to reference.
# An explicit old -> new mapping is used instead of assigning a positional list:
# it does not depend on the column order in the CSV, and re-running this cell
# after the rename is a harmless no-op (labels absent from the frame are ignored).
COLUMN_RENAMES = {
    "Hours Studied": "hours_studied",
    "Previous Scores": "previous_scores",
    "Extracurricular Activities": "extracurricular",
    "Sleep Hours": "sleep_hours",
    "Sample Question Papers Practiced": "practice_papers",
    "Performance Index": "performance_index",
}
df = df.rename(columns=COLUMN_RENAMES)

# Fail loudly if the CSV headers differ from what the mapping expects, rather
# than carrying on with columns the later cells cannot find.
missing_columns = [name for name in COLUMN_RENAMES.values() if name not in df.columns]
if missing_columns:
    raise KeyError(f"Expected columns not found after renaming: {missing_columns}")

In [None]:

# Structural overview: (rows, columns) tuple first, then the column labels,
# and finally the per-column dtype / non-null report that info() prints.
for overview in (df.shape, df.columns):
    print(overview)
df.info()



In [80]:
# Check that the extracurricular column only contains the two expected labels.
# A notebook cell displays only its last expression, so the distinct labels are
# printed explicitly; otherwise the unique() result is computed and thrown away.
print(df["extracurricular"].unique())

# dropna=False adds a row for missing values (if any) so they cannot hide
# from this sanity check; with no missing values the output is unchanged.
df["extracurricular"].value_counts(dropna=False)

extracurricular
No     5052
Yes    4948
Name: count, dtype: int64

In [81]:
# Convert the categorical yes/no labels to binary (yes -> 1, no -> 0) so the
# column can be compared numerically with performance.
YES_NO_CODES = {"yes": 1, "no": 0}

extracurricular_raw = df["extracurricular"]

# Guard against re-running the cell: once the column is numeric, pushing it
# through the string mapping again would turn every value into NaN
# ("0"/"1" are not keys of YES_NO_CODES).
if not pd.api.types.is_numeric_dtype(extracurricular_raw):
    extracurricular_encoded = (
        extracurricular_raw
        .astype(str)
        .str.strip()   # tolerate stray whitespace around the label
        .str.lower()   # tolerate "Yes" / "YES" / "yes"
        .map(YES_NO_CODES)
    )

    # map() yields NaN for any label that is not in YES_NO_CODES. Values that
    # were present in the raw data but failed to map are reported instead of
    # silently becoming missing; genuinely missing values stay NaN as before.
    unrecognised = extracurricular_raw[
        extracurricular_encoded.isna() & extracurricular_raw.notna()
    ]
    if not unrecognised.empty:
        raise ValueError(
            f"Unrecognised extracurricular labels: {sorted(unrecognised.astype(str).unique())}"
        )

    df["extracurricular"] = extracurricular_encoded

# Check if it worked
print(df["extracurricular"].value_counts())

extracurricular
0    5052
1    4948
Name: count, dtype: int64


In [82]:
# Summary statistics (count, mean, std, min, quartiles, max) for each variable,
# rounded to two decimal places.
df.describe().round(decimals=2)

Unnamed: 0,hours_studied,previous_scores,extracurricular,sleep_hours,practice_papers,performance_index
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,4.99,69.45,0.49,6.53,4.58,55.22
std,2.59,17.34,0.5,1.7,2.87,19.21
min,1.0,40.0,0.0,4.0,0.0,10.0
25%,3.0,54.0,0.0,5.0,2.0,40.0
50%,5.0,69.0,0.0,7.0,5.0,55.0
75%,7.0,85.0,1.0,8.0,7.0,71.0
max,9.0,99.0,1.0,9.0,9.0,100.0


In [83]:
# Performance index broken down by hours studied: group size, mean, median,
# and the min/max of each group, rounded to two decimals.
performance_by_study_hours = df.groupby("hours_studied")["performance_index"]
study_stats = ["count", "mean", "median", "min", "max"]
study_summary = performance_by_study_hours.agg(study_stats).round(2)

study_summary


Unnamed: 0_level_0,count,mean,median,min,max
hours_studied,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1152,44.12,44.0,10.0,79.0
2,1085,46.43,46.0,11.0,82.0
3,1119,49.78,50.0,15.0,84.0
4,1085,52.72,53.0,18.0,87.0
5,1094,55.52,55.0,23.0,91.0
6,1133,58.49,59.0,23.0,93.0
7,1129,60.23,60.0,28.0,96.0
8,1088,64.21,64.0,26.0,99.0
9,1115,65.73,65.0,32.0,100.0


In [84]:
# Performance index broken down by hours of sleep: group size, mean and
# median per sleep-hours value, rounded to two decimals.
performance_by_sleep_hours = df.groupby("sleep_hours")["performance_index"]
sleep_summary = performance_by_sleep_hours.agg(["count", "mean", "median"]).round(2)

sleep_summary


Unnamed: 0_level_0,count,mean,median
sleep_hours,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,1619,53.79,54.0
5,1606,54.8,55.0
6,1673,54.66,54.0
7,1676,55.28,55.0
8,1804,55.93,56.0
9,1622,56.81,57.0


In [85]:
# Performance index for each extracurricular group: group size, mean and
# median, rounded to two decimals.
performance_by_extracurricular = df.groupby("extracurricular")["performance_index"]
extra_summary = performance_by_extracurricular.agg(["count", "mean", "median"]).round(2)

extra_summary

Unnamed: 0_level_0,count,mean,median
extracurricular,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,5052,54.76,55.0
1,4948,55.7,55.0


In [86]:
# Performance index broken down by number of practice papers: group size,
# mean and median per value, rounded to two decimals.
performance_by_practice_papers = df.groupby("practice_papers")["performance_index"]
practice_summary = performance_by_practice_papers.agg(["count", "mean", "median"]).round(2)

practice_summary


Unnamed: 0_level_0,count,mean,median
practice_papers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,951,52.95,53.0
1,978,54.61,55.0
2,930,55.26,56.0
3,1035,55.26,55.0
4,955,54.15,53.0
5,1028,55.45,56.0
6,1059,56.15,56.0
7,987,55.78,56.0
8,1026,55.45,55.0
9,1051,56.88,56.0


In [87]:
# Pairwise correlation between every pair of columns, used to gauge the
# strength of each relationship. "pearson" is DataFrame.corr's default method,
# spelled out here for the reader; values are rounded to three decimals.
corr = df.corr(method="pearson").round(decimals=3)
corr

Unnamed: 0,hours_studied,previous_scores,extracurricular,sleep_hours,practice_papers,performance_index
hours_studied,1.0,-0.012,0.004,0.001,0.017,0.374
previous_scores,-0.012,1.0,0.008,0.006,0.008,0.915
extracurricular,0.004,0.008,1.0,-0.023,0.013,0.025
sleep_hours,0.001,0.006,-0.023,1.0,0.004,0.048
practice_papers,0.017,0.008,0.013,0.004,1.0,0.043
performance_index,0.374,0.915,0.025,0.048,0.043,1.0


In [88]:
# Save every summary table and the correlation matrix as a CSV file.
# The mapping is output file name -> table; files are written in the order listed.
tables_to_export = {
    "study_hours_summary.csv": study_summary,
    "sleep_hours_summary.csv": sleep_summary,
    "extracurricular_summary.csv": extra_summary,
    "practice_summary.csv": practice_summary,
    "correlation_matrix.csv": corr,
}

for csv_name, table in tables_to_export.items():
    table.to_csv(csv_name)
