# load libraries

In [2]:
# numpy
import numpy as np
np.set_printoptions(precision=2) # print NumPy floating-point values with two decimal places

# pandas
import pandas as pd

# matplotlib
import matplotlib.pyplot as plt

# sklearn
from sklearn.linear_model import LogisticRegression
    # logistic-regression classifier; used in this notebook for a binary (Pass / not Pass) target
from sklearn.model_selection import train_test_split
    # randomly splits a dataset into training and test subsets at a chosen ratio;
    # scoring on the held-out test subset helps reveal whether the model has overfit the training data
from sklearn.metrics import accuracy_score
    # computes classification accuracy

# load dataset

In [3]:
# Read the student-performance records from a CSV file; the path is relative,
# so it is resolved against the current working directory.
data = pd.read_csv(filepath_or_buffer='Student_performance_data.csv')

# check data

In [5]:
# Preview the first five rows to confirm the file was read correctly.
data.head(n=5)

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,1001,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196,2.0
1,1002,18,0,0,1,15.408756,0,0,1,0,0,0,0,3.042915,1.0
2,1003,15,0,2,3,4.21057,26,0,2,0,0,0,0,0.112602,4.0
3,1004,17,1,0,3,10.028829,14,0,3,1,0,0,0,2.054218,3.0
4,1005,17,1,0,2,4.672495,17,1,3,0,0,0,0,1.288061,4.0


# data pre-processing

In [7]:
# Derive the binary target column 'Pass' from 'GradeClass':
#   Pass = 0 when GradeClass == 4, Pass = 1 for every other class.
# NOTE(review): class 4 is presumably the failing grade — confirm against the
# dataset's documentation.

# A missing grade compares as "not equal to 4" (NaN != 4 is True), so it would
# silently be labelled as a pass. Stop here instead of mislabelling rows.
assert data['GradeClass'].notna().all(), (
    "GradeClass contains missing values; cannot derive 'Pass' reliably"
)

# np.where picks 1 where the condition holds and 0 elsewhere.
data['Pass'] = np.where(data['GradeClass'] != 4, 1, 0)

In [8]:
# Preview the first five rows again to check that the 'Pass' column was added.
data.head(n=5)

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass,Pass
0,1001,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196,2.0,1
1,1002,18,0,0,1,15.408756,0,0,1,0,0,0,0,3.042915,1.0,1
2,1003,15,0,2,3,4.21057,26,0,2,0,0,0,0,0.112602,4.0,0
3,1004,17,1,0,3,10.028829,14,0,3,1,0,0,0,2.054218,3.0,1
4,1005,17,1,0,2,4.672495,17,1,3,0,0,0,0,1.288061,4.0,0


# split data

In [12]:
# Target: the binary 'Pass' column.
y = data['Pass']

# Features: every remaining column except the four dropped below.
#   - 'StudentID'  : excluded from the features (presumably just a row identifier — confirm).
#   - 'Pass'       : the target itself.
#   - 'GradeClass' : 'Pass' is computed from it, so keeping it would leak the answer.
#   - 'GPA'        : NOTE(review): presumably the basis of 'GradeClass', hence
#                    also leakage — confirm against the dataset's documentation.
X = data.drop(columns=['StudentID', 'GradeClass', 'GPA', 'Pass'])

print(X)  # inspect the feature matrix when needed

# Hold out part of the data for evaluation (70/30 and 80/20 are common ratios).
#   test_size=0.2   -> 20% of the rows form the test set; the other 80% are used for training.
#   random_state=42 -> fixed seed for the shuffle, so every run produces the
#                      same split and results are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


      Age  Gender  Ethnicity  ParentalEducation  StudyTimeWeekly  Absences  \
0      17       1          0                  2        19.833723         7   
1      18       0          0                  1        15.408756         0   
2      15       0          2                  3         4.210570        26   
3      17       1          0                  3        10.028829        14   
4      17       1          0                  2         4.672495        17   
...   ...     ...        ...                ...              ...       ...   
2387   18       1          0                  3        10.680555         2   
2388   17       0          0                  1         7.583217         4   
2389   16       1          0                  2         6.805500        20   
2390   16       1          1                  0        12.416653        17   
2391   16       1          0                  2        17.819907        13   

      Tutoring  ParentalSupport  Extracurricular  Sports  Music

# create, train, and evaluate model

In [14]:
# Build the classifier (solver capped at 200 iterations) and train it on the
# training split; fit() returns the estimator, so both steps are chained.
logistic_model = LogisticRegression(max_iter=200).fit(X_train, y_train)

# Show the learned parameters.
print(f"Coefficients: {logistic_model.coef_}")
print(f"Intercept: {logistic_model.intercept_}")

# Predict on the held-out test split and report the fraction of correct labels.
y_pred = logistic_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")


Coefficients: [[ 0.01 -0.02  0.11 -0.1   0.1  -0.37  0.7   0.5   0.56  0.39  0.32 -0.42]]
Intercept: [2.71]
Accuracy: 0.90
