In [2]:
import pandas as pd
import os

# Read the student records from 'datasets/student_data.csv' into a DataFrame.
# Every statistic computed further down is derived from this frame.
DATA_PATH = 'datasets/student_data.csv'
df = pd.read_csv(DATA_PATH)

# Basic Inspection
# Simple counts and peeks at the raw frame.
num_students = len(df)
num_female_students = (df['sex'] == 'F').sum()
ages_first_5 = df['age'].head(5).tolist()
num_gp_school = (df['school'] == 'GP').sum()
last_10_records = df.tail(10)
num_urban_students = df['address'].value_counts().get('U', 0)
# `famsize` is a categorical label ('LE3' / 'GT3'), not a number. Parsing the
# digits out of 'GT3' gives 3, and 3 > 3 is never true, so the count must be
# taken on the 'GT3' category itself. The comparison is also NaN-safe.
famsize_gt_3 = (df['famsize'] == 'GT3').sum()
unique_reasons = df['reason'].unique()
num_mother_guardian = df['guardian'].value_counts().get('mother', 0)

# Structure and Metadata
num_columns = df.shape[1]
column_dtypes = df.dtypes
non_null_G3 = df['G3'].notnull().sum()
# DataFrame.info() only prints its summary and returns None, so it cannot be
# used as a value. Keep the printed summary, then compute the actual footprint
# in bytes (deep=True also counts the contents of object/string columns).
df.info()
memory_usage = df.memory_usage(deep=True).sum()
# Columns stored as 'object' are treated as categorical; all others as numeric.
is_object_column = df.dtypes == 'object'
num_categorical = is_object_column.sum()
column_names = df.columns.tolist()
num_numeric = (~is_object_column).sum()
shape = df.shape
non_null_per_column = df.count()

# Statistical Summarization
avg_G3 = df['G3'].mean()
# NOTE(review): per the UCI Student Performance codebook, `studytime` is an
# ordinal code (1: <2 h, 2: 2-5 h, 3: 5-10 h, 4: >10 h), not a number of hours.
# Comparing the code with 10 can never match, so "more than 10 hours" is
# code 4. Confirm this file uses the same coding.
studytime_gt_10 = (df['studytime'] == 4).sum()
min_age, max_age = df['age'].min(), df['age'].max()
zero_absences = (df['absences'] == 0).sum()
median_famrel = df['famrel'].median()
unique_Mjob = df['Mjob'].nunique()
std_G1 = df['G1'].std()
walc_5 = (df['Walc'] == 5).sum()
avg_traveltime = df['traveltime'].mean()
failures_4 = (df['failures'] == 4).sum()
G2_75th = df['G2'].quantile(0.75)
health_1 = (df['health'] == 1).sum()
max_absences = df['absences'].max()
higher_edu = (df['higher'] == 'yes').sum()
avg_freetime = df['freetime'].mean()
unique_Fjob = df['Fjob'].nunique()
goout_dist = df['goout'].value_counts()
dalc_1 = (df['Dalc'] == 1).sum()
avg_Medu = df['Medu'].mean()
Pstatus_A = (df['Pstatus'] == 'A').sum()

# Missing Values and Anomalies
missing_G3 = df['G3'].isnull().sum()
total_missing = df.isnull().sum().sum()
non_null_internet = df['internet'].notnull().sum()
# Share of rows (in percent) that contain at least one missing value.
percent_missing_rows = (df.isnull().any(axis=1).mean()) * 100
age_out_of_range = ((df['age'] < 15) | (df['age'] > 22)).sum()
num_duplicates = df.duplicated().sum()
absences_gt_50 = (df['absences'] > 50).sum()
missing_by_column = df.isnull().sum().sort_values(ascending=False)
inconsistent_grades = ((df['G1'] > 20) | (df['G2'] > 20)).sum()
# A student with no past failures (0) is a legitimate record, so the lower
# bound of the valid range is 0. Using `< 1` would flag every such student
# as an anomaly.
failures_out_of_range = ((df['failures'] < 0) | (df['failures'] > 4)).sum()

# Relationships and Patterns
# Row counts for combined conditions, conditional means and correlations
# against the final grade G3.
female_G3_above_15 = ((df['sex'] == 'F') & (df['G3'] > 15)).sum()
avg_G3_MS = df.loc[df['school'] == 'MS', 'G3'].mean()
# NOTE(review): per the UCI Student Performance codebook, `studytime` is an
# ordinal code (1: <2 h, 2: 2-5 h, 3: 5-10 h, 4: >10 h). "More than 5 hours"
# therefore means codes 3 and 4; comparing the code with 5 can never match.
# Confirm this file uses the same coding.
studytime_gt_5_G3_below_10 = df.query('studytime >= 3 and G3 < 10').shape[0]
internet_no_G3_above_12 = df.query('internet == "no" and G3 > 12').shape[0]
# .get(..., None) yields None instead of raising when the group is absent.
avg_absences_famsup_yes = df.groupby('famsup')['absences'].mean().get('yes', None)
Mjob_teacher_G3_above_15 = df.query('Mjob == "teacher" and G3 > 15').shape[0]
corr_G1_G3 = df['G1'].corr(df['G3'])
traveltime_gt_2_G3_below_10 = df.query('traveltime > 2 and G3 < 10').shape[0]
avg_G3_higher_yes = df.groupby('higher')['G3'].mean().get('yes', None)
walc_gt_3_G3_below_8 = df.query('Walc > 3 and G3 < 8').shape[0]
reason_dist_G3_above_15 = df.loc[df['G3'] > 15, 'reason'].value_counts()
famsize_LE3_G3_above_12 = df.query('famsize == "LE3" and G3 > 12').shape[0]
avg_famrel_guardian_father = df.groupby('guardian')['famrel'].mean().get('father', None)
schoolsup_yes_G3_below_10 = df.query('schoolsup == "yes" and G3 < 10').shape[0]
corr_studytime_G3 = df['studytime'].corr(df['G3'])
address_R_G3_above_15 = df.query('address == "R" and G3 > 15').shape[0]
avg_goout_G3_below_10 = df.loc[df['G3'] < 10, 'goout'].mean()
paid_yes_G3_above_12 = df.query('paid == "yes" and G3 > 12').shape[0]
health_dist_G3_above_15 = df.loc[df['G3'] > 15, 'health'].value_counts()

# Basic Data Saving
# Each export is written, read back once, and the round-tripped frame is
# reused for every check on that file.

# CSV including the index column.
df.to_csv('students_export.csv')
csv_roundtrip = pd.read_csv('students_export.csv')
rows_saved_csv = csv_roundtrip.shape[0]
# Verify that the 'sex' column survived the CSV round trip.
sex_in_csv = csv_roundtrip['sex'].equals(df['sex'])

# CSV without the index column; only its size on disk is recorded.
df.to_csv('students_export_no_index.csv', index=False)
file_size_csv = os.path.getsize('students_export_no_index.csv')

# Excel export of the five selected columns. index=False keeps the row index
# out of the sheet; otherwise it is read back as an extra unnamed column and
# the column count reports 6 instead of 5.
df[['school', 'sex', 'G1', 'G2', 'G3']].to_excel('students_selected.xlsx', index=False)
excel_roundtrip = pd.read_excel('students_selected.xlsx')
columns_in_excel = excel_roundtrip.shape[1]
# Confirm that the 'G3' column survived the Excel round trip.
G3_in_excel = excel_roundtrip['G3'].equals(df['G3'])

# In-memory JSON string, one object per row.
json_structure = df.to_json(orient='records')

# JSON export restricted to the students of school 'GP'.
gp_students = df[df['school'] == 'GP']
gp_students.to_json('gp_students.json', orient='records')
gp_records_saved = pd.read_json('gp_students.json').shape[0]

# Print results (for demonstration, you can comment out or remove these in production)
# The report is assembled as a list of pre-formatted lines and then written
# out one line per print call, in the same order as the values were computed.
report_lines = [
    # Basic inspection
    f"Number of students: {num_students}",
    f"Number of female students: {num_female_students}",
    f"Ages of first 5 records: {ages_first_5}",
    f"Number of students at GP school: {num_gp_school}",
    f"Last 10 records:\n{last_10_records}",
    f"Number of urban students: {num_urban_students}",
    f"Family size > 3: {famsize_gt_3}",
    f"Unique reasons: {unique_reasons}",
    f"Mother as guardian: {num_mother_guardian}",
    # Structure and metadata
    f"Number of columns: {num_columns}",
    f"Column dtypes:\n{column_dtypes}",
    f"Non-null G3 values: {non_null_G3}",
    f"Memory usage: {memory_usage}",
    f"Categorical columns: {num_categorical}",
    f"Column names: {column_names}",
    f"Numeric columns: {num_numeric}",
    f"Shape: {shape}",
    f"Non-null per column:\n{non_null_per_column}",
    # Statistical summarization
    f"Average G3: {avg_G3}",
    f"Studytime > 10 hours: {studytime_gt_10}",
    f"Min/Max age: {min_age}, {max_age}",
    f"Zero absences: {zero_absences}",
    f"Median famrel: {median_famrel}",
    f"Unique Mjob: {unique_Mjob}",
    f"Std G1: {std_G1}",
    f"Walc=5: {walc_5}",
    f"Avg traveltime: {avg_traveltime}",
    f"Failures=4: {failures_4}",
    f"G2 75th percentile: {G2_75th}",
    f"Health=1: {health_1}",
    f"Max absences: {max_absences}",
    f"Higher education: {higher_edu}",
    f"Avg freetime: {avg_freetime}",
    f"Unique Fjob: {unique_Fjob}",
    f"Goout distribution:\n{goout_dist}",
    f"Dalc=1: {dalc_1}",
    f"Avg Medu: {avg_Medu}",
    f"Pstatus=A: {Pstatus_A}",
    # Missing values and anomalies
    f"Missing G3: {missing_G3}",
    f"Total missing: {total_missing}",
    f"Non-null internet: {non_null_internet}",
    f"Percent missing rows: {percent_missing_rows}",
    f"Age out of range: {age_out_of_range}",
    f"Duplicate rows: {num_duplicates}",
    f"Absences > 50: {absences_gt_50}",
    f"Missing by column:\n{missing_by_column}",
    f"Inconsistent grades: {inconsistent_grades}",
    f"Failures out of range: {failures_out_of_range}",
    # Relationships and patterns
    f"Female G3>15: {female_G3_above_15}",
    f"Avg G3 MS: {avg_G3_MS}",
    f"Studytime>5 & G3<10: {studytime_gt_5_G3_below_10}",
    f"Internet=no & G3>12: {internet_no_G3_above_12}",
    f"Avg absences famsup=yes: {avg_absences_famsup_yes}",
    f"Mjob=teacher & G3>15: {Mjob_teacher_G3_above_15}",
    f"Corr G1/G3: {corr_G1_G3}",
    f"Traveltime>2 & G3<10: {traveltime_gt_2_G3_below_10}",
    f"Avg G3 higher=yes: {avg_G3_higher_yes}",
    f"Walc>3 & G3<8: {walc_gt_3_G3_below_8}",
    f"Reason dist G3>15:\n{reason_dist_G3_above_15}",
    f"Famsize=LE3 & G3>12: {famsize_LE3_G3_above_12}",
    f"Avg famrel guardian=father: {avg_famrel_guardian_father}",
    f"Schoolsup=yes & G3<10: {schoolsup_yes_G3_below_10}",
    f"Corr studytime/G3: {corr_studytime_G3}",
    f"Address=R & G3>15: {address_R_G3_above_15}",
    f"Avg goout G3<10: {avg_goout_G3_below_10}",
    f"Paid=yes & G3>12: {paid_yes_G3_above_12}",
    f"Health dist G3>15:\n{health_dist_G3_above_15}",
    # Data saving
    f"Rows saved CSV: {rows_saved_csv}",
    f"File size CSV: {file_size_csv}",
    f"Columns in Excel: {columns_in_excel}",
    f"JSON structure: {json_structure[:100]}...",  # Only the first 100 chars
    f"GP records saved: {gp_records_saved}",
    f"G3 in Excel matches: {G3_in_excel}",
    f"Sex in CSV matches: {sex_in_csv}",
]
for report_line in report_lines:
    print(report_line)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      395 non-null    object
 1   sex         395 non-null    object
 2   age         395 non-null    int64 
 3   address     395 non-null    object
 4   famsize     395 non-null    object
 5   Pstatus     395 non-null    object
 6   Medu        395 non-null    int64 
 7   Fedu        395 non-null    int64 
 8   Mjob        395 non-null    object
 9   Fjob        395 non-null    object
 10  reason      395 non-null    object
 11  guardian    395 non-null    object
 12  traveltime  395 non-null    int64 
 13  studytime   395 non-null    int64 
 14  failures    395 non-null    int64 
 15  schoolsup   395 non-null    object
 16  famsup      395 non-null    object
 17  paid        395 non-null    object
 18  activities  395 non-null    object
 19  nursery     395 non-null    object
 20  higher    