In [48]:
import pandas as pd

In [49]:
# Location of the raw student records, relative to the notebook's working directory.
STUDENTS_CSV = "../files/students_record.csv"

# Load the records into a DataFrame; the cells below all read from students_df.
students_df = pd.read_csv(STUDENTS_CSV)

In [50]:
# Structural overview of the dataset: dimensions, column dtypes, null counts, first rows.
# Everything is computed up front, then reported section by section.
rows, columns = students_df.shape
column_dtypes = students_df.dtypes
missing_per_column = students_df.isna().sum()  # number of null cells in each column
preview = students_df.head()

print("\n====================================== DATA EXPLORATION =================================================")
print(f"\n{rows} rows, {columns} columns")

print("\n====================== DATA TYPES ========================================")
print(column_dtypes)

print("\n====================== MISSING VALUES ========================================")
print("\nMissing Values in each column")
print(missing_per_column)

print("\n====================== DATASET PREVIEW ========================================")
print(preview)




20 rows, 11 columns

student_id             str
first_name             str
last_name              str
gender                 str
age                  int64
grade_level          int64
math_score           int64
science_score        int64
english_score        int64
history_score        int64
attendance_rate    float64
dtype: object


Missing Values in each column
student_id         0
first_name         0
last_name          0
gender             0
age                0
grade_level        0
math_score         0
science_score      0
english_score      0
history_score      0
attendance_rate    0
dtype: int64

  student_id first_name last_name gender  age  grade_level  math_score  \
0     STU001       Emma   Johnson      F   16           10          92   
1     STU002       Liam     Smith      M   15           10          78   
2     STU003     Olivia  Williams      F   16           10          95   
3     STU004       Noah     Brown      M   17           11          45   
4     STU005       

In [51]:
# Show the column labels so the score columns can be picked out by name.
column_index = students_df.columns
print(column_index)

Index(['student_id', 'first_name', 'last_name', 'gender', 'age', 'grade_level',
       'math_score', 'science_score', 'english_score', 'history_score',
       'attendance_rate'],
      dtype='str')


In [52]:

# Collect the per-subject score columns by their "_score" naming pattern.
columns = students_df.columns

# "average_score" is a derived column that a later cell adds to students_df. It also
# contains "_score", so without this exclusion re-running this cell after that one
# would treat the average as a subject and change the summary below.
DERIVED_SCORE_COLS = {"average_score"}

subject_cols = [
    col
    for col in columns
    if "_score" in col and col not in DERIVED_SCORE_COLS
]

# Descriptive statistics (count, mean, std, min, quartiles, max) for each subject.
stats_summary = students_df[subject_cols].describe()

print(stats_summary)


       math_score  science_score  english_score  history_score
count    20.00000      20.000000      20.000000      20.000000
mean     80.60000      80.700000      81.600000      81.550000
std      12.17158       9.079184       9.848323       9.450564
min      45.00000      58.000000      65.000000      63.000000
25%      75.25000      75.500000      74.250000      77.500000
50%      83.00000      81.500000      84.500000      81.500000
75%      89.50000      87.250000      88.500000      88.750000
max      95.00000      96.000000      95.000000      94.000000


In [53]:
# Per-student mean over the subject columns: axis=1 ("columns") averages across each row,
# whereas the default axis=0 would average down each column.
students_df["average_score"] = students_df[subject_cols].mean(axis="columns")

# Order students from highest to lowest average and show the first seven.
top_students = students_df.sort_values(by="average_score", ascending=False)
print(top_students.iloc[:7])

   student_id first_name  last_name gender  age  grade_level  math_score  \
2      STU003     Olivia   Williams      F   16           10          95   
18     STU019     Evelyn    Jackson      F   15            9          89   
0      STU001       Emma    Johnson      F   16           10          92   
12     STU013        Mia   Gonzalez      F   17           11          86   
8      STU009   Isabella  Rodriguez      F   15            9          91   
15     STU016      Ethan     Thomas      M   17           11          91   
16     STU017     Amelia     Taylor      F   16           10          83   

    science_score  english_score  history_score  attendance_rate  \
2              96             88             94             0.98   
18             93             95             88             0.96   
0              88             95             78             0.97   
12             91             84             88             0.97   
8              82             88             85    

In [54]:
def determine_pass_or_fail(average_score, pass_mark=75):
    """Decide whether an average score counts as a pass.

    Args:
        average_score: The student's average score across subjects.
        pass_mark: Minimum average required to pass (inclusive). Defaults to 75.

    Returns:
        True when average_score is at least pass_mark, otherwise False.
    """
    # To return the labels "Pass" / "Fail" rather than booleans, use instead:
    # return "Pass" if average_score >= pass_mark else "Fail"
    return average_score >= pass_mark

# Flag each student as passing (True) or failing (False) from their average score.
students_df["pass_fail"] = students_df["average_score"].apply(determine_pass_or_fail)

# Tally of the True/False flags. An outcome that never occurs has no entry in this Series.
pass_fail_counts = students_df["pass_fail"].value_counts()

# The total number of students is the row count of the frame; either of these gives it:
# students_df.shape[0]
# len(students_df)

# .get(True, 0) rather than [True]: when no student passes there is no True key,
# and plain indexing would raise KeyError instead of reporting 0%.
print(f"\nPass Percentage: {pass_fail_counts.get(True, 0) / len(students_df) * 100}%")


Pass Percentage: 85.0%
