In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant


In [3]:
df = pd.read_excel("students_internet_performance.xls")
df.head() #проеверка на адекватность

Unnamed: 0.1,Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,...,goout,health,failures_PORTUGESE,paid_PORTUGESE,absences_PORTUGESE,finalGrade_PORTUGESE,failures_MATH,paid_MATH,absences_MATH,finalGrade_MATH
0,0,GP,F,18,U,GT3,A,4,4,at_home,...,4,3,0,no,4,11,0,no,6,6
1,1,GP,F,17,U,GT3,T,1,1,at_home,...,3,3,0,no,2,11,0,no,4,6
2,2,GP,F,15,U,LE3,T,1,1,at_home,...,2,3,0,no,6,12,3,yes,10,10
3,3,GP,F,15,U,GT3,T,4,2,health,...,2,5,0,no,0,14,0,yes,2,15
4,4,GP,F,16,U,GT3,T,3,3,other,...,2,5,0,no,0,13,0,yes,4,10


In [4]:
categorical_vars = df.select_dtypes(include=['object', 'category']).columns
quantitative_vars = df.select_dtypes(include=['int64', 'float64']).columns

print(f"Categorical variables are: {categorical_vars}")
print(f"Quantitative variables are: {quantitative_vars}")

Categorical variables are: Index(['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob',
       'reason', 'schoolsup', 'famsup', 'activities', 'nursery', 'higher',
       'internet', 'romantic', 'paid_PORTUGESE', 'paid_MATH'],
      dtype='object')
Quantitative variables are: Index(['Unnamed: 0', 'age', 'Medu', 'Fedu', 'studytime', 'famrel', 'freetime',
       'goout', 'health', 'failures_PORTUGESE', 'absences_PORTUGESE',
       'finalGrade_PORTUGESE', 'failures_MATH', 'absences_MATH',
       'finalGrade_MATH'],
      dtype='object')


In [5]:
# Описательные статистики для количественных переменных
quantitative_desc = df[quantitative_vars].describe()
# Описательные статистики для категориальных переменных
categorical_desc = df[categorical_vars].describe()

# Проверка пропущенных значений
missing_values = df.isnull().sum()

print(quantitative_desc)
print(categorical_desc)
print(missing_values)

       Unnamed: 0         age        Medu        Fedu   studytime      famrel  \
count   370.00000  370.000000  370.000000  370.000000  370.000000  370.000000   
mean    184.50000   16.575676    2.800000    2.556757    2.043243    3.935135   
std     106.95404    1.180672    1.083505    1.086138    0.847934    0.911301   
min       0.00000   15.000000    0.000000    0.000000    1.000000    1.000000   
25%      92.25000   16.000000    2.000000    2.000000    1.000000    4.000000   
50%     184.50000   17.000000    3.000000    3.000000    2.000000    4.000000   
75%     276.75000   17.000000    4.000000    3.750000    2.000000    5.000000   
max     369.00000   22.000000    4.000000    4.000000    4.000000    5.000000   

         freetime       goout      health  failures_PORTUGESE  \
count  370.000000  370.000000  370.000000          370.000000   
mean     3.224324    3.116216    3.562162            0.132432   
std      0.985506    1.128592    1.407558            0.489762   
min      1

In [6]:
# Расчет доли учеников с доступом к интернету
internet_access_ratio = df['internet'].value_counts(normalize=True)
print(internet_access_ratio)

internet
yes    0.845946
no     0.154054
Name: proportion, dtype: float64


In [7]:
# Корреляция Пирсона между оценками
correlation = df[['finalGrade_PORTUGESE', 'finalGrade_MATH']].corr(method='pearson')
print(correlation)

                      finalGrade_PORTUGESE  finalGrade_MATH
finalGrade_PORTUGESE              1.000000         0.493134
finalGrade_MATH                   0.493134         1.000000


In [8]:
# Выбор переменных, отражающих социально-демографический статус
socio_demographic_vars = ['Medu', 'Fedu', 'Mjob', 'Fjob']
# Описательная статистика для социально-демографических переменных
socio_demographic_desc = df[socio_demographic_vars].describe()
print(socio_demographic_desc)


             Medu        Fedu
count  370.000000  370.000000
mean     2.800000    2.556757
std      1.083505    1.086138
min      0.000000    0.000000
25%      2.000000    2.000000
50%      3.000000    3.000000
75%      4.000000    3.750000
max      4.000000    4.000000


In [9]:
internet_performance_correlation = df.groupby('internet')[['finalGrade_PORTUGESE', 'finalGrade_MATH']].mean()
print(internet_performance_correlation)


          finalGrade_PORTUGESE  finalGrade_MATH
internet                                       
no                   11.982456         9.578947
yes                  12.658147        10.626198


In [16]:
from sklearn.preprocessing import LabelEncoder

# Encode the categorical target variable
le = LabelEncoder()
df['internet'] = le.fit_transform(df['internet'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df[['finalGrade_PORTUGESE', 'finalGrade_MATH']], df['internet'], test_size=0.2, random_state=42)

# Create the linear regression model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions for the test set
y_pred = model.predict(X_test)

# Calculate the mean squared error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 0.11904099677473057


In [14]:
from scipy import stats

# Проверка связи между оценками по математике и португальскому языку
t_statistic, p_value = stats.ttest_ind(df['finalGrade_MATH'], df['finalGrade_PORTUGESE'])
print(f"T-статистика: {t_statistic}")
print(f"P-значение: {p_value}")

T-статистика: -7.348463679214774
P-значение: 5.326862450134039e-13


In [15]:
from sklearn.tree import DecisionTreeClassifier

# Создание модели решающего дерева
model = DecisionTreeClassifier()

# Обучение модели
model.fit(df[['finalGrade_PORTUGESE', 'finalGrade_MATH']], df['internet'])

# Предсказание для новых данных
y_pred = model.predict(X_test)