In [None]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import scipy

In [81]:
# Load the dataset and print a structural summary (columns, dtypes, non-null counts).
data = pd.read_csv(filepath_or_buffer='./StudentsPerformance.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [None]:
# Inspect the categorical columns: print how often each category occurs,
# followed by a row of asterisks as a visual separator.
labels = [
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
]
for label in labels:
    print(data[label].value_counts(), '*' * 50, sep='\n')

In [None]:
# Pre-process the scores: add each student's average score (rounded to two
# decimals) and a letter grade derived from that average.
average = data[['math score', 'reading score', 'writing score']].mean(axis=1)

# Plain column assignment instead of DataFrame.insert(): insert() raises
# ValueError when the column already exists, which made this cell fail on a
# second run. Assignment appends the columns on the first run and simply
# overwrites them on later runs.
data['average score'] = np.round(average, decimals=2)

# Grade bins are right-closed: (60, 70] -> D, ..., (90, 100] -> A.
# include_lowest=True also closes the first bin on the left, so an average of
# exactly 0 is graded 'F' instead of becoming NaN.
performance_level = pd.cut(
    data['average score'],
    bins=[0, 60, 70, 80, 90, 100],
    labels=['F', 'D', 'C', 'B', 'A'],
    include_lowest=True,
)
data['performance level'] = performance_level

# Show a random sample of rows to sanity-check the new columns.
data.sample(10)

In [None]:
# Kernel-density estimate of each subject's score distribution, one panel per
# subject stacked vertically in a single figure.
subjects = ['math score', 'reading score', 'writing score']
color = ['green', 'blue', 'orange']

fig = plt.figure(num='Score Distribution', figsize=(5, 8))
sns.set_style('darkgrid')

# enumerate/zip pairs every subject with its panel position and line colour,
# replacing the hand-maintained counter; drawing on an explicit Axes avoids
# relying on pyplot's implicit "current axes" state.
for column, (subject, line_color) in enumerate(zip(subjects, color), start=1):
    ax = fig.add_subplot(len(subjects), 1, column)
    sns.kdeplot(data=data, x=subject, color=line_color, ax=ax)
    ax.set_title("{} distribution".format(subject))

fig.tight_layout()
plt.show()

In [None]:
# Chi-square goodness-of-fit test on gender: checks whether the observed gender
# counts are compatible with an even split, as a rough check that the sample
# was drawn at random.
gender = data['gender'].unique()  # distinct gender labels present in the data
observed_number = data['gender'].value_counts().values.tolist()

# Expected counts under an even split, derived from the observed counts rather
# than hard-coded as [500, 500]. This keeps sum(f_exp) equal to sum(f_obs),
# which stats.chisquare requires, for any sample size or number of categories.
category_count = len(observed_number)
expected_number = [sum(observed_number) / category_count] * category_count

result = stats.chisquare(f_obs=observed_number, f_exp=expected_number)
print(result)

In [None]:
# Correlation between the subject scores, drawn as an annotated heat map.
fig = plt.figure()
heatmap_ax = fig.add_subplot(1, 1, 1)
score_correlation = data[subjects].corr()
sns.heatmap(score_correlation, annot=True, ax=heatmap_ax)
plt.show()

In [None]:
# Mean and median math score per gender.
# Aggregations are named by string: passing the numpy callables np.mean /
# np.median to agg() is deprecated in recent pandas and emits a FutureWarning,
# while the string names give the same result and column labels.
data.groupby('gender')[['math score']].agg(['mean', 'median'])

In [None]:
# Test whether the math grade is independent of gender
# (chi-square test of independence on the grade x gender contingency table).
# include_lowest=True closes the first bin on the left: without it the bin is
# (0, 60], so a math score of exactly 0 becomes NaN and that student is
# silently dropped from the contingency table.
math_grading = pd.cut(
    data['math score'],
    bins=[0, 60, 70, 80, 90, 100],
    labels=['F', 'D', 'C', 'B', 'A'],
    include_lowest=True,
)
crosstab = pd.crosstab(math_grading, data['gender'])
result = stats.chi2_contingency(crosstab)
# The second element of the result is the p-value; shown as the cell output.
result[1]

In [None]:
# Math score by gender: box plot on the left, histogram normalised to
# probabilities on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
box_ax, hist_ax = ax
sns.boxplot(x='gender', y='math score', data=data, palette='summer', ax=box_ax)
sns.histplot(
    x='math score',
    hue='gender',
    data=data,
    stat='probability',
    fill=True,
    ax=hist_ax,
)
plt.show()


In [None]:
# Writing score by gender: summary statistics followed by overlaid histograms.
# A notebook only renders a cell's last expression automatically, so the
# summary table is passed to display() (provided by the Jupyter/IPython
# kernel); as a bare first statement its result was computed and discarded.
# Aggregations are named by string because passing np.mean / np.median to
# agg() is deprecated in recent pandas.
display(data.groupby('gender')[['writing score']].agg(['mean', 'median']))

# Overlaid per-gender histograms (normalised to probabilities) with a marginal
# box plot above them.
fig = px.histogram(
    data, x='writing score',
    marginal='box', opacity=0.6,
    color='gender',
    histnorm='probability',
    title='男生与女生在文科上的表现',
    template='plotly_white'
)

fig.update_layout(barmode='overlay', width=800)
fig.show()

In [None]:
# Relationship between students' results and their parents' education level.
# Honour students are those whose average score is at least 90.
honor_students = data[data['average score'] >= 90]
total_count = data['parental level of education'].value_counts()
honor_count = honor_students['parental level of education'].value_counts()

# Two side-by-side donut charts: all students (left) vs. honour students (right).
fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'domain'}, {'type': 'domain'}]])

# (category counts, trace name, subplot column)
pie_panels = [
    (total_count, '整体学生父母受教育程度', 1),
    (honor_count, '高分学生父母受教育程度', 2),
]
for counts, trace_name, panel_col in pie_panels:
    fig.add_trace(
        go.Pie(values=counts.values, hole=0.4, labels=counts.index, name=trace_name),
        row=1, col=panel_col,
    )

# Text placed in the hole of each donut.
centre_labels = [
    dict(text='整体父母', x=0.15, y=0.5, font_size=20, showarrow=False),
    dict(text='高分父母', x=0.85, y=0.5, font_size=20, showarrow=False),
]
fig.update_layout(title_text="学生父母受教育程度", annotations=centre_labels, width=900)
fig.show()

In [None]:
# Chi-square test of independence between parental education level and the
# student's performance level; the p-value (second element of the result) is
# shown as the cell output.
crosstab = pd.crosstab(
    index=data['parental level of education'],
    columns=data['performance level'],
)
result = stats.chi2_contingency(observed=crosstab)
result[1]