In [1]:
import pandas as pd
import seaborn as sns
import scipy.stats as stats

In [2]:
columns_name = [
    'timestamp','sex','matric_gpa_%' ,'study_year','faculty','gpa_%_2023','accommodation_status',
    'monthly_allowance','scholarship','study_hours_week','socialising_week','drinks_night',
    'classes_missed','modules_failed','in_relationship','parental_approval','relationship_parents'
]

survey_df = pd.read_csv("student_survey.csv", names = columns_name, header=0)

## Manipulation and Cleaning Data

In [3]:
survey_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 406 entries, 0 to 405
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   timestamp             406 non-null    object 
 1   sex                   404 non-null    object 
 2   matric_gpa_%          399 non-null    float64
 3   study_year            333 non-null    object 
 4   faculty               399 non-null    object 
 5   gpa_%_2023            320 non-null    float64
 6   accommodation_status  383 non-null    object 
 7   monthly_allowance     375 non-null    object 
 8   scholarship           398 non-null    object 
 9   study_hours_week      403 non-null    object 
 10  socialising_week      404 non-null    object 
 11  drinks_night          404 non-null    object 
 12  classes_missed        403 non-null    object 
 13  modules_failed        403 non-null    object 
 14  in_relationship       403 non-null    object 
 15  parental_approval     4

> Colums and Rows 
> outlier
> Data Error Handling
> Null Values Handling
> Feature selection

In [4]:
# Deleting Timestamp column
survey_df.drop(["timestamp"], axis = 1, inplace =True)

In [5]:
survey_df.head()

Unnamed: 0,sex,matric_gpa_%,study_year,faculty,gpa_%_2023,accommodation_status,monthly_allowance,scholarship,study_hours_week,socialising_week,drinks_night,classes_missed,modules_failed,in_relationship,parental_approval,relationship_parents
0,Female,76.0,2nd Year,Arts & Social Sciences,72.0,Private accommodation/ stay with family/friends,R 4001- R 5000,No,8+,Only weekends,8+,3,0,Yes,Yes,Very close
1,Male,89.0,2nd Year,Economic & Management Sciences,75.0,Private accommodation/ stay with family/friends,R 7001 - R 8000,"Yes (NSFAS, etc...)",8+,Only weekends,3-5,4+,0,No,Yes,Very close
2,Male,76.0,1st Year,AgriSciences,55.0,Private accommodation/ stay with family/friends,R 4001- R 5000,No,3-5,2,8+,3,0,No,Yes,Very close
3,Male,89.0,2nd Year,Engineering,84.0,Private accommodation/ stay with family/friends,R 6001 - R 7000,No,3-5,3,8+,2,0,Yes,Yes,Very close
4,Female,74.0,2nd Year,Arts & Social Sciences,52.0,Private accommodation/ stay with family/friends,R 4001- R 5000,No,3-5,Only weekends,5-8,1,3,No,Yes,Fair


In [6]:
survey_df.shape

(406, 16)

In [7]:
# Outlier Handling
num_columns = ['matric_gpa_%','gpa_%_2023']
survey_df.describe()

Unnamed: 0,matric_gpa_%,gpa_%_2023
count,399.0,320.0
mean,77.989724,66.268469
std,7.048618,9.147906
min,34.0,30.0
25%,74.0,60.0
50%,78.0,65.0
75%,83.0,73.0
max,99.0,95.22


In [12]:
# Handling outlier with IQR
def handle_outlier(dataframe: pd.DataFrame, columns : list[str]):
    
    all_index_to_drop = []
    
    for every in columns:
        q1 = dataframe[every].quantile(0.25)
        q3 = dataframe[every].quantile(0.75)
        max_value = 1.5 * (q3 - q1) + q3 
        min_value = q1 - 1.5 * (q3 - q1)

        to_drop_index = dataframe[every][(dataframe[every] > max_value) | (dataframe[every] < min_value)].index
        
        dataframe = dataframe.drop(to_drop_index, axis = 0, inplace=False)
        
        all_index_to_drop.extend(to_drop_index)
        
    return dataframe, all_index_to_drop

In [13]:
survey_df_no_outlier , info = handle_outlier(survey_df, num_columns)

In [18]:
# HAndling Null Values
survey_df_no_outlier.isna().sum()

sex                      2
matric_gpa_%             7
study_year              72
faculty                  7
gpa_%_2023              84
accommodation_status    23
monthly_allowance       31
scholarship              8
study_hours_week         3
socialising_week         2
drinks_night             2
classes_missed           3
modules_failed           3
in_relationship          3
parental_approval        3
relationship_parents     3
dtype: int64

In [31]:
survey_df_no_outlier = survey_df_no_outlier[~ (survey_df_no_outlier.isna().sum(axis=1) >=5)]

In [36]:
survey_df_no_outlier.isna().sum()

sex                      0
matric_gpa_%             5
study_year               0
faculty                  5
gpa_%_2023              78
accommodation_status    17
monthly_allowance       25
scholarship              2
study_hours_week         1
socialising_week         0
drinks_night             0
classes_missed           1
modules_failed           1
in_relationship          1
parental_approval        1
relationship_parents     1
dtype: int64

In [33]:
survey_df_no_outlier.study_year.value_counts(dropna= False)

study_year
2nd Year        149
1st Year        124
NaN              66
3rd Year         40
4th Year          7
Postgraduate      5
Name: count, dtype: int64

In [34]:
survey_df_no_outlier["study_year"].fillna(value = "0th Year", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  survey_df_no_outlier["study_year"].fillna(value = "0th Year", inplace=True)


In [35]:
survey_df_no_outlier.study_year.value_counts(dropna= False)

study_year
2nd Year        149
1st Year        124
0th Year         66
3rd Year         40
4th Year          7
Postgraduate      5
Name: count, dtype: int64

In [39]:
grouped = survey_df_no_outlier[["matric_gpa_%", "faculty"]].groupby("faculty")

In [43]:
survey_df_no_outlier['matric_gpa_%'].fillna(grouped['matric_gpa_%'].transform('mean'), inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  survey_df_no_outlier['matric_gpa_%'].fillna(grouped['matric_gpa_%'].transform('mean'), inplace = True)


In [45]:
survey_df_no_outlier.isna().sum()

sex                      0
matric_gpa_%             0
study_year               0
faculty                  5
gpa_%_2023              78
accommodation_status    17
monthly_allowance       25
scholarship              2
study_hours_week         1
socialising_week         0
drinks_night             0
classes_missed           1
modules_failed           1
in_relationship          1
parental_approval        1
relationship_parents     1
dtype: int64

In [47]:
# Replace categorical values with it mode
cat_cols = [
    'faculty','study_hours_week','monthly_allowance', 'classes_missed',
    'modules_failed','in_relationship','parental_approval','relationship_parents'
]
for each in cat_cols:
    mode_val = survey_df_no_outlier[each].mode()[0]
    survey_df_no_outlier[each].fillna(mode_val, inplace = True)

# Creating new feature with NaN value
unknown_nan = ['accommodation_status','scholarship']
for each in unknown_nan:
    survey_df_no_outlier[each].fillna('Unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  survey_df_no_outlier[each].fillna(mode_val, inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  survey_df_no_outlier[each].fillna('Unknown', inplace=True)


In [48]:
survey_df_no_outlier.isna().sum()

sex                      0
matric_gpa_%             0
study_year               0
faculty                  0
gpa_%_2023              78
accommodation_status     0
monthly_allowance        0
scholarship              0
study_hours_week         0
socialising_week         0
drinks_night             0
classes_missed           0
modules_failed           0
in_relationship          0
parental_approval        0
relationship_parents     0
dtype: int64

In [51]:
survey_df_no_outlier.dropna(inplace = True)

In [54]:
survey_df_no_outlier.info()

<class 'pandas.core.frame.DataFrame'>
Index: 313 entries, 0 to 405
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   sex                   313 non-null    object 
 1   matric_gpa_%          313 non-null    float64
 2   study_year            313 non-null    object 
 3   faculty               313 non-null    object 
 4   gpa_%_2023            313 non-null    float64
 5   accommodation_status  313 non-null    object 
 6   monthly_allowance     313 non-null    object 
 7   scholarship           313 non-null    object 
 8   study_hours_week      313 non-null    object 
 9   socialising_week      313 non-null    object 
 10  drinks_night          313 non-null    object 
 11  classes_missed        313 non-null    object 
 12  modules_failed        313 non-null    object 
 13  in_relationship       313 non-null    object 
 14  parental_approval     313 non-null    object 
 15  relationship_parents  313 no