In [3]:
import pandas as pd
import numpy as np

# Labels to assign to the CSV's columns. The file is read with header=None in the
# loading cell, so these names are applied positionally, in the order listed.
column_names = [
    'First Term Gpa', 'Second Term Gpa',
    'First Language', 'Funding', 'School',
    'Fast Track', 'Coop', 'Residency',
    'Gender', 'Prev Education', 'Age Group',
    'High School Average Mark', 'Math Score', 'English Grade',
    'First Year Persistence',
]

# Location of the raw data, relative to the notebook's working directory
file_path = '../Data/student_data.csv'

In [4]:
# Load the headerless CSV into `df`, labelling the columns from `column_names`
# and reading '?' entries as missing values.
# NOTE: on failure only a message is printed and `df` is not assigned by this cell,
# so later cells that rely on `df` will not see freshly loaded data.
try:
    df = pd.read_csv(
        file_path,
        header=None,          # Telling pandas there is no header row in the file
        names=column_names,   # Providing the list of correct column names
        na_values='?'         # Telling pandas to treat '?' character as missing (NaN)
    )
    print("DataFrame loaded successfully with correct headers and NaN values!")

    # Displaying the first 5 rows to verify headers and data
    print("\nFirst 5 rows:")
    print(df.head())

    # Displaying DataFrame info to verify column names, counts, and Dtypes.
    # DataFrame.info() writes its report to stdout itself and returns None, so it is
    # called directly; wrapping it in print() would append a stray "None" line.
    print("\nDataFrame Info:")
    df.info()

except FileNotFoundError:
    # Missing file: report the path that was tried
    print(f"\nError: Could not find the file at {file_path}")
    print("Please ensure the file exists and the path is correct.")
except Exception as e:
    # Any other failure (e.g. a parsing problem): surface the underlying message
    print(f"\nAn error occurred while loading the file: {e}")

DataFrame loaded successfully with correct headers and NaN values!

First 5 rows:
   First Term Gpa  Second Term Gpa  First Language  Funding  School  \
0        0.000000         0.000000             1.0        2       6   
1        2.500000         2.000000             3.0        4       6   
2        4.250000         3.923077             1.0        1       6   
3        3.020833         2.321429             3.0        4       6   
4        4.275000         4.326923             1.0        2       6   

   Fast Track  Coop  Residency  Gender  Prev Education  Age Group  \
0           2     1          1       2             1.0        1.0   
1           1     2          2       2             1.0        3.0   
2           2     1          1       1             2.0        3.0   
3           1     2          2       2             2.0        3.0   
4           1     1          1       1             2.0        3.0   

   High School Average Mark  Math Score  English Grade  First Year Persisten

In [5]:
import pandas as pd

# Expects `df` from the loading cell, where '?' entries were already read as NaN.
# 'Prev Education' is counted as loaded; if its 0.0 code should be treated as
# missing, convert it before running this cell, for example:
#   df['Prev Education'] = df['Prev Education'].replace(0.0, np.nan)

# Columns treated as categorical (numerically coded categories); the original note
# attributes this selection to "Description of Variables.docx".
categorical_columns_list = [
    'First Language', 'Funding', 'School', 'Fast Track', 'Coop',
    'Residency', 'Gender', 'Prev Education', 'Age Group', 'English Grade',
]

print("--- Value Counts for Categorical Columns ---")

# Report the frequency of every distinct value, one column at a time
for column in categorical_columns_list:
    # Guard: report and skip names that are absent from the DataFrame
    if column not in df.columns:
        print(f"\nColumn '{column}' not found in DataFrame.")
        continue

    print(f"\n--- Counts for: {column} ---")
    # dropna=False keeps NaN as its own row in the tally
    print(df[column].value_counts(dropna=False))

print("\n--- End of Counts ---")

--- Value Counts for Categorical Columns ---

--- Counts for: First Language ---
First Language
1.0    720
3.0    602
NaN    111
2.0      4
Name: count, dtype: int64

--- Counts for: Funding ---
Funding
2    796
4    574
8     29
1     27
5     10
9      1
Name: count, dtype: int64

--- Counts for: School ---
School
6    1437
Name: count, dtype: int64

--- Counts for: Fast Track ---
Fast Track
2    1066
1     371
Name: count, dtype: int64

--- Counts for: Coop ---
Coop
2    999
1    438
Name: count, dtype: int64

--- Counts for: Residency ---
Residency
1    853
2    584
Name: count, dtype: int64

--- Counts for: Gender ---
Gender
2    1111
1     325
3       1
Name: count, dtype: int64

--- Counts for: Prev Education ---
Prev Education
1.0    863
2.0    482
0.0     88
NaN      4
Name: count, dtype: int64

--- Counts for: Age Group ---
Age Group
3.0    550
1.0    357
2.0    294
4.0    115
5.0     41
7.0     37
6.0     30
8.0      9
NaN      4
Name: count, dtype: int64

--- Counts for: En