# Data Understanding

Dataset ini diambil dari: https://www.kaggle.com/datasets/spscientist/students-performance-in-exams

In [1]:
import pandas as pd

In [2]:
# Location of the students-performance CSV; presumably a mounted Google Drive
# folder in Colab - adjust when running elsewhere.
csv_path = "/content/drive/MyDrive/baru/StudentsPerformance.csv"

# Load the raw data and show it as the cell output.
df = pd.read_csv(csv_path)
df

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


Memeriksa struktur data

In [3]:
# Print the index range, column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


Memeriksa tipe data

In [17]:
# List the dtype of every column.
# NOTE(review): this cell carries execution count 17, so its saved output reflects
# the frame after later cells ran (float64 "writing score" and the extra
# "lulus_semua" column). Re-run the notebook top to bottom to refresh it.
df.dtypes

Unnamed: 0,0
gender,object
race/ethnicity,object
parental level of education,object
lunch,object
test preparation course,object
math score,int64
reading score,int64
writing score,float64
lulus_semua,object


Memeriksa statistik deskriptif

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


Analisis Korelasi

In [5]:
# Pairwise correlation between the numeric score columns. Left as the cell's
# last expression so the notebook renders it as a table instead of plain text.
df.corr(numeric_only=True)

               math score  reading score  writing score
math score       1.000000       0.817580       0.802642
reading score    0.817580       1.000000       0.954598
writing score    0.802642       0.954598       1.000000


# Data Preparation

Mencari Missing Values

In [7]:
# Percentage of missing values per column: missing count divided by row count.
missing_counts = df.isna().sum()
missing_counts / len(df) * 100

Unnamed: 0,0
gender,0.0
race/ethnicity,0.0
parental level of education,0.0
lunch,0.0
test preparation course,0.0
math score,0.0
reading score,0.0
writing score,0.0


Mencari Data Duplikat

In [8]:
# Flag rows that repeat an earlier row, then show only those rows.
duplicate_mask = df.duplicated()
df[duplicate_mask]

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score


Outliers

In [9]:
def outlier_percentages(frame, whisker=1.5):
    """Return the share of IQR outliers (in percent) for each numeric column.

    A value counts as an outlier when it lies below ``Q1 - whisker * IQR`` or
    above ``Q3 + whisker * IQR`` of its own column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to inspect; only ``float64`` / ``int64`` columns are checked.
    whisker : float, default 1.5
        Multiplier applied to the IQR when computing the bounds.

    Returns
    -------
    pandas.DataFrame
        One row per checked column (column name as index) with a single
        ``'Persentase Outliers'`` column; the columns axis is named ``'Kolom'``.
    """
    records = []
    for col in frame.select_dtypes(include=['float64', 'int64']).columns:
        q1 = frame[col].quantile(0.25)
        q3 = frame[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - whisker * iqr
        upper_bound = q3 + whisker * iqr
        outliers = frame[(frame[col] < lower_bound) | (frame[col] > upper_bound)]
        records.append({
            'Kolom': col,
            'Persentase Outliers': (len(outliers) / len(frame)) * 100,
        })

    # Explicit columns keep set_index() working even when no numeric column exists.
    return (
        pd.DataFrame(records, columns=['Kolom', 'Persentase Outliers'])
        .set_index('Kolom')
        .rename_axis(None, axis=0)
        .rename_axis('Kolom', axis=1)
    )


# Outlier percentage per numeric column before any treatment
results_df = outlier_percentages(df)

# Show the summary table
display(results_df)

Kolom,Persentase Outliers
math score,0.8
reading score,0.6
writing score,0.5


Melakukan imputasi outliers dengan membatasi (clipping) nilai ke batas bawah dan batas atas IQR

In [12]:
# Score columns whose outliers are capped at the IQR bounds.
columns_to_impute = ["math score", "reading score", "writing score"]

for col in columns_to_impute:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Replace the whole column instead of writing through df.loc[:, col]:
    # the bounds can be fractional, and setting float values into an existing
    # integer column via .loc is deprecated upcasting in recent pandas.
    df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)

Memeriksa kembali

In [13]:
# Repeat the IQR outlier check on the current state of df.
cols = df.select_dtypes(include=['float64', 'int64'])

results = []
for col in cols.columns:
    values = df[col]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5*iqr
    upper_bound = q3 + 1.5*iqr

    # Rows whose value falls outside [lower_bound, upper_bound]
    is_outlier = (values < lower_bound) | (values > upper_bound)
    outliers = df[is_outlier]
    percent_outliers = (len(outliers)/len(df))*100
    results.append({'Kolom': col, 'Persentase Outliers': percent_outliers})

# Build the summary table: column names as index, columns axis labelled 'Kolom'
results_df = (
    pd.DataFrame(results)
    .set_index('Kolom')
    .rename_axis(None, axis=0)
    .rename_axis('Kolom', axis=1)
)

# Show the summary table
display(results_df)

Kolom,Persentase Outliers
math score,0.0
reading score,0.0
writing score,0.0


Konstruk Data

Menambahkan kolom lulus_semua agar dapat mengetahui kinerja setiap individu

In [14]:

def lulus_semua(row, batas_lulus=60):
    """Label a student as passing only if every subject score meets the threshold.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'math score', 'reading score' and 'writing score'.
    batas_lulus : int or float, default 60
        Minimum score required in each of the three subjects.

    Returns
    -------
    str
        'Lulus' when all three scores are >= batas_lulus, otherwise 'Tidak Lulus'.
    """
    kolom_nilai = ('math score', 'reading score', 'writing score')
    if all(row[kolom] >= batas_lulus for kolom in kolom_nilai):
        return 'Lulus'
    return 'Tidak Lulus'

# Create the lulus_semua column by labelling each row
df['lulus_semua'] = df.apply(lulus_semua, axis=1)

# Preview the scores next to the new label. Left as the cell's last expression
# so the notebook renders it as a table instead of plain text.
df[['math score', 'reading score', 'writing score', 'lulus_semua']].head()

   math score  reading score  writing score  lulus_semua
0          72             72           74.0        Lulus
1          69             90           88.0        Lulus
2          90             95           93.0        Lulus
3          47             57           44.0  Tidak Lulus
4          76             78           75.0        Lulus


In [15]:
df

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,lulus_semua
0,female,group B,bachelor's degree,standard,none,72,72,74.0,Lulus
1,female,group C,some college,standard,completed,69,90,88.0,Lulus
2,female,group B,master's degree,standard,none,90,95,93.0,Lulus
3,male,group A,associate's degree,free/reduced,none,47,57,44.0,Tidak Lulus
4,male,group C,some college,standard,none,76,78,75.0,Lulus
...,...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95.0,Lulus
996,male,group C,high school,free/reduced,none,62,55,55.0,Tidak Lulus
997,female,group C,high school,free/reduced,completed,59,71,65.0,Tidak Lulus
998,female,group D,some college,standard,completed,68,78,77.0,Lulus
