In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from itertools import combinations
from scipy.stats import ttest_ind

# Widen the pandas display limits (rows and columns) for notebook output.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 50)

# Load the data and prepare it in a single chain:
#   * drop `school`, which is not used for this task;
#   * drop the rows that have no `score`;
#   * rename 'studytime, granular' to an identifier-friendly name.
students = (
    pd.read_csv('stud_math.csv')
    .drop(columns=['school'])
    .dropna(subset=['score'])
    .rename(columns=lambda name: name.replace(
        'studytime, granular', 'studytime_granular'))
)

# Names of the numeric columns.
df_numeric = students.select_dtypes(include=[np.number])
numeric_cols = df_numeric.columns.to_numpy()
print(numeric_cols)
# Names of the non-numeric columns.
df_non_numeric = students.select_dtypes(exclude=[np.number])
non_numeric_cols = df_non_numeric.columns.to_numpy()
print(non_numeric_cols)

# function for the analysis of nominal (categorical) variables (boxplot)


def get_boxplot(column):
    """Draw a boxplot of `score` for each category of `column`.

    Reads the module-level ``students`` frame and shows the figure; nothing is
    returned.

    Parameters
    ----------
    column : str
        Name of the column of ``students`` to put on the x axis.
    """
    fig, ax = plt.subplots(figsize=(14, 4))
    # The frame is passed as is: the earlier filter tested the column for
    # membership in itself, which kept every row and only copied the data.
    sns.boxplot(x=column, y='score', data=students, ax=ax)
    # Rotate the category labels through the axes object rather than through
    # pyplot's "current axes" state.
    ax.tick_params(axis='x', rotation=45)
    ax.set_title('Boxplot for ' + column)
    plt.show()

# function for the analysis of nominal (categorical) variables (Student's t-test)
def get_stat_dif(column, alpha=0.05):
    """Report whether `score` differs significantly between categories of `column`.

    Runs Student's t-test (``scipy.stats.ttest_ind``) on the `score` values of
    every pair of distinct categories found in ``students[column]`` and prints
    a message as soon as one pair is significant at the Bonferroni-corrected
    level ``alpha / number_of_pairs``.

    Parameters
    ----------
    column : str
        Name of a categorical column of the module-level ``students`` frame.
    alpha : float, default 0.05
        Family-wise significance level before the Bonferroni correction.

    Returns
    -------
    None
        The outcome is only printed; nothing is printed when no pair differs.
    """
    # Pair up the distinct categories, not the raw cell values. Pairing every
    # row value produces one "comparison" per pair of rows (mostly duplicates
    # and self-comparisons), which inflates the Bonferroni divisor until no
    # difference can ever pass the threshold.
    categories = students.loc[:, column].dropna().unique()
    category_pairs = list(combinations(categories, 2))
    if not category_pairs:
        return  # fewer than two categories: nothing to compare

    threshold = alpha / len(category_pairs)  # Bonferroni correction
    for first, second in category_pairs:
        first_scores = students.loc[students.loc[:, column] == first, 'score']
        second_scores = students.loc[students.loc[:, column] == second, 'score']
        if ttest_ind(first_scores, second_scores).pvalue <= threshold:
            print('Найдены статистически значимые различия для колонки', column)
            break

In [None]:
# First ten rows, followed by the dtype / non-null summary of the frame.
display(students.iloc[:10])
students.info()

In [None]:
# Share of missing values in every column.
missing_share = students.isna().mean()
for col, pct_missing in missing_share.items():
    print(f'{col} - {pct_missing :.1%}')
# Author's conclusion: no column needs to be dropped, every share is below 12%.

In [None]:
# For each numeric column that has gaps: record a boolean "<col>_ismissing"
# indicator column, then fill the gaps with the column median.
df_numeric = students.select_dtypes(include=[np.number])
numeric_cols = df_numeric.columns.values
for col in numeric_cols:
    missing = students[col].isna()
    if not missing.any():
        continue  # nothing to impute in this column

    print('imputing missing values for: {}'.format(col))
    students['{}_ismissing'.format(col)] = missing
    students[col] = students[col].fillna(students[col].median())

In [None]:
# For each non-numeric column that has gaps: record a boolean "<col>_ismissing"
# indicator column, then fill the gaps with the most frequent value.
df_non_numeric = students.select_dtypes(exclude=[np.number])
non_numeric_cols = df_non_numeric.columns.values
for col in non_numeric_cols:
    missing = students[col].isna()
    if not missing.any():
        continue  # nothing to impute in this column

    print('imputing missing values for: {}'.format(col))
    students['{}_ismissing'.format(col)] = missing

    # value_counts() is sorted by frequency, so its first label is the most
    # frequent value (the same value describe() reports as 'top').
    top = students[col].value_counts().index[0]
    students[col] = students[col].fillna(top)

In [None]:
# Drop every missing-value indicator column created during imputation.
# The names are derived from the frame itself instead of a hard-coded list, so
# the cell does not raise KeyError when a column had nothing to impute or when
# the cell is run a second time.
indicator_cols = [name for name in students.columns
                  if str(name).endswith('_ismissing')]
students = students.drop(columns=indicator_cols)
# Check that no missing values remain.
for col in students.columns:
    pct_missing = students[col].isna().mean()
    print(f'{col} - {pct_missing :.1%}')

In [None]:
# Outlier check for `age`: histogram plus summary statistics.
students['age'].hist()
students['age'].describe()
# Author's judgement: 22 is a plausible age for a student who studies poorly.

In [None]:
# Outlier check for `Medu`: histogram plus summary statistics.
students['Medu'].hist()
students['Medu'].describe()
# Author's conclusion: there are no outliers.

In [None]:
# Outlier check for `Fedu`: histogram plus summary statistics.
students['Fedu'].hist()
students['Fedu'].describe()
# Author's conclusion: there is an outlier, most likely just a typo.

In [None]:
# Fix the `Fedu` typo: the value 40 is replaced with 4, then the histogram is
# redrawn to confirm the result.
students['Fedu'] = students['Fedu'].replace(40, 4)
students['Fedu'].hist()
# done

In [None]:
# Outlier check for `traveltime`: histogram plus summary statistics.
students['traveltime'].hist()
students['traveltime'].describe()
# Author's conclusion: there are no outliers.

In [None]:
# Outlier check for `studytime`: histogram plus summary statistics.
students['studytime'].hist()
students['studytime'].describe()
# Author's conclusion: there are no outliers.

In [None]:
# Outlier check for `failures`: histogram plus summary statistics.
students['failures'].hist()
students['failures'].describe()
# Author's conclusion: there are no outliers.

In [None]:
# Outlier check for `studytime_granular`: histogram plus summary statistics.
students['studytime_granular'].hist()
students['studytime_granular'].describe()
# Author's note: a more detailed analysis is needed; to be dealt with later.

In [None]:
# Outlier check for `famrel`: histogram plus summary statistics.
students['famrel'].hist()
students['famrel'].describe()
# Author's conclusion: there is one outlier, most likely just a typo.

In [None]:
# Fix the `famrel` typo: the value -1 is replaced with 1, then the histogram
# is redrawn to confirm the result.
students['famrel'] = students['famrel'].replace(-1, 1)
students['famrel'].hist()
# done

In [None]:
# Outlier check for `freetime`: histogram plus summary statistics.
students['freetime'].hist()
students['freetime'].describe()
# Author's conclusion: there are no outliers.

In [None]:
# Outlier check for `goout`: histogram plus summary statistics.
students['goout'].hist()
students['goout'].describe()
# Author's conclusion: there are no outliers.

In [None]:
# Outlier check for `health`: histogram plus summary statistics.
students['health'].hist()
students['health'].describe()
# Author's conclusion: there are no outliers.

In [None]:
# Outlier check for `absences`: histogram plus summary statistics.
students['absences'].hist()
students['absences'].describe()
# Author's note: there is not enough data to decide what counts as an outlier.

In [None]:
# Outlier check for `score`: histogram plus summary statistics.
students['score'].hist()
students['score'].describe()
# Author's conclusion: there are no outliers.

In [None]:
# Count the unique values of every non-numeric column to spot typos or
# unexpected categories. One loop replaces sixteen copy-pasted cells that
# differed only in the column name.
for col in ['sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason',
            'guardian', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
            'higher', 'internet', 'romantic']:
    display(pd.DataFrame(students[col].value_counts()))
# Author's conclusion: the data in all of these columns is correct.

In [None]:
# Pairwise scatter plots with fitted regression lines, used to look for
# relationships with "score".
sns.pairplot(data=students, kind='reg')

In [None]:
# Correlation matrix of the numeric columns only. The frame still contains
# string columns, and DataFrame.corr() raises on them in pandas >= 2.0;
# selecting the numeric columns explicitly works on every pandas version.
students.select_dtypes(include=[np.number]).corr()
# Author's observations: the parents' education (Medu, Fedu) and the study time
# outside school (studytime) have the largest influence on the math score; the
# remaining parameters are practically uncorrelated with it and can be ignored.
# The studytime_granular column remains a mystery; no hypotheses so far.

In [None]:
# Boxplots of "score" for every nominal (categorical) variable.
for col in (
    'sex', 'address', 'famsize', 'Pstatus',
    'Mjob', 'Fjob', 'reason', 'guardian',
    'schoolsup', 'famsup', 'paid', 'activities',
    'nursery', 'higher', 'internet', 'romantic',
):
    get_boxplot(col)
# Author's observations: the following columns have a noticeable effect on "score":
#   address   - urban students do slightly better than rural ones;
#   Mjob      - mothers working in health care have a positive effect on learning;
#   Fjob      - fathers who are teachers "do not rest even at home";
#   schoolsup - additional classes are useful;
#   higher    - students who want higher education study better;
#   internet  - internet access has a positive effect on performance.
# The remaining columns have no noticeable effect on the math score and can be neglected.

In [None]:
# Student's t-test for every nominal (categorical) variable.
for col in (
    'sex', 'address', 'famsize', 'Pstatus',
    'Mjob', 'Fjob', 'reason', 'guardian',
    'schoolsup', 'famsup', 'paid', 'activities',
    'nursery', 'higher', 'internet', 'romantic',
):
    get_stat_dif(col)
# Author's note from the original run: no statistically significant differences
# were reported (suspected cause: an error in data preparation). Re-check this
# conclusion whenever get_stat_dif or the cleaning steps change.

In [None]:
# Columns kept as parameters for the model.
model_columns = [
    'Medu', 'Fedu', 'studytime',
    'address', 'Mjob', 'Fjob',
    'schoolsup', 'higher', 'internet',
]
students_for_model = students[model_columns]
students_for_model.head()

In [None]:
# Based on the intermediate findings, the following columns were selected for building the model:
# Medu, Fedu, studytime, address, Mjob, Fjob, schoolsup, higher, internet
# The meaning of the studytime_granular column remains unclear.
# Student's t-test did not reveal statistically significant differences for the nominal variables; this most likely points to an error
# made during data preparation and cleaning.