In [99]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split as tts
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from xgboost import XGBRegressor
import numpy as np


from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from catboost import CatBoostRegressor, CatBoostClassifier


In [100]:
def fill_missing_values(df: pd.DataFrame, strategy: str ='mode') -> pd.DataFrame:
    """Fill missing values using specified strategy or predictions from XGBRegressor"""
    from xgboost import XGBRegressor
   
    df = df.copy()
    for column in df.columns:
        if df[column].dtype == 'object':
            fill_value = df[column].mode()[0]
        else:
            if strategy == 'mean':
                fill_value = df[column].mean()
            elif strategy == 'median':
                fill_value = df[column].median()
            elif strategy == 'mode':
                fill_value = df[column].mode()[0]
            elif strategy == 'xgb':
                filled_data = df.dropna(subset=[column])
                missing_data = df[df[column].isnull()]
               
                xgb = XGBRegressor()
                xgb.fit(filled_data.drop(columns=[column]), filled_data[column])
               
                predicted_values = xgb.predict(missing_data.drop(columns=[column]))
               
                df.loc[missing_data.index, column] = predicted_values
            else:
                raise ValueError("Invalid strategy. Please choose from 'mean', 'median', 'mode', or 'xgb'.")
            df[column].fillna(value=fill_value, inplace=True)
    return df








def one_hot_encoder(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """Категорийные данные в численные"""
    df = df.copy()
    df_num = df.drop(columns = columns)
    df = df[columns].copy()
    df_prep = pd.DataFrame()
    for column in df.columns:
        unique_values = df[column].unique()
        if len(unique_values) <= 2:
            df_prep[column] = df[column]
            for i in range(len(unique_values)):
                df_prep[column] = df_prep[column].replace({unique_values[i]: i})
        else:
            dummies = pd.get_dummies( df[column], prefix=column )
            df_prep = pd.concat([df_prep, dummies], axis=1)
    return pd.concat([df_num, df_prep], axis=1)


In [101]:
df = pd.read_csv('student_data.csv')
df

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,MS,M,20,U,LE3,A,2,2,services,services,...,5,5,4,4,5,4,11,9,9,9
391,MS,M,17,U,LE3,T,3,1,services,services,...,2,4,5,3,4,2,3,14,16,16
392,MS,M,21,R,GT3,T,1,1,other,other,...,5,5,3,3,3,3,3,10,8,7
393,MS,M,18,R,LE3,T,3,2,services,other,...,4,4,1,3,4,5,0,11,12,10


In [102]:
df['G2'] = (df['G1'] + df['G2'])/2
# df['Fedu'] = df['Fedu'].apply(lambda x: 0 if x < 3 else 1)
# df['Medu'] = df['Medu'].apply(lambda x: 0 if x < 3 else 1)
# df['failures'] = df['failures'].apply(lambda x: 1 if x != 0 else 0)
# studytime_mapping = {1: 'malo', 2: 'norm', 3: 'a lot', 4: 'a lot'} 
# mapping = {1: 'malo', 2: 'malo', 3: 'norm', 4: 'a lot', 5: 'a lot'}
df['Alcohol'] = (df['Dalc']+ df['Walc'])/2
# df['Walc'] = df['Walc'].map(mapping)
# df['studytime'] = df['studytime'].map(studytime_mapping)


def label_absences(value):
    return "not much" if value < 10 else "a lot"

# Apply the function to the 'absences' column
df['absences'] = df['absences'].apply(label_absences)
df

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3,Alcohol
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,3,4,1,1,3,not much,5,5.5,6,1.0
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,3,3,1,1,3,not much,5,5.0,6,1.0
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,3,2,2,3,3,a lot,7,7.5,10,2.5
3,GP,F,15,U,GT3,T,4,2,health,services,...,2,2,1,1,5,not much,15,14.5,15,1.0
4,GP,F,16,U,GT3,T,3,3,other,other,...,3,2,1,2,5,not much,6,8.0,10,1.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,MS,M,20,U,LE3,A,2,2,services,services,...,5,4,4,5,4,a lot,9,9.0,9,4.5
391,MS,M,17,U,LE3,T,3,1,services,services,...,4,5,3,4,2,not much,14,15.0,16,3.5
392,MS,M,21,R,GT3,T,1,1,other,other,...,5,3,3,3,3,not much,10,9.0,7,3.0
393,MS,M,18,R,LE3,T,3,2,services,other,...,4,1,3,4,5,not much,11,11.5,10,3.5


In [103]:

categorical_cols = []
numerical_cols = []




for column in df.columns:
    if df[column].dtype == 'object':
        categorical_cols.append(column)
    else:
        numerical_cols.append(column)


drop_list = ['G1','famsize', 'Pstatus', 'guardian', 'Mjob', 'Fjob', 'traveltime', 'Dalc', 'Walc', 'famrel',  'school', 'reason', 'school', 'age', 'paid', 'higher', 'nursery', 'famsup', 'Medu', 'Fedu', 'schoolsup', 'romantic', 'address']



categorical_cols = [i for i in categorical_cols if i not in drop_list]
numerical_cols = [i for i in numerical_cols if i not in drop_list]


df = df.drop(drop_list, axis=1)


print(numerical_cols)
print(categorical_cols)


print(len(categorical_cols)+ len(numerical_cols))

['studytime', 'failures', 'freetime', 'goout', 'health', 'G2', 'G3', 'Alcohol']
['sex', 'activities', 'internet', 'absences']
12


In [104]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 100))

# Fit and transform the data
df[['G2', 'G3']] = scaler.fit_transform(df[['G2', 'G3']])

In [105]:
df

Unnamed: 0,sex,studytime,failures,activities,internet,freetime,goout,health,absences,G2,G3,Alcohol
0,F,2,0,no,no,3,4,3,not much,20.588235,30.0,1.0
1,F,2,0,no,yes,3,3,3,not much,17.647059,30.0,1.0
2,F,2,3,no,yes,3,2,3,a lot,32.352941,50.0,2.5
3,F,3,0,yes,yes,2,2,5,not much,73.529412,75.0,1.0
4,F,2,0,no,no,3,2,5,not much,35.294118,50.0,1.5
...,...,...,...,...,...,...,...,...,...,...,...,...
390,M,2,2,no,no,5,4,4,a lot,41.176471,45.0,4.5
391,M,1,0,no,yes,4,5,2,not much,76.470588,80.0,3.5
392,M,1,3,no,no,5,3,3,not much,41.176471,35.0,3.0
393,M,1,0,no,yes,4,1,5,not much,55.882353,50.0,3.5


In [106]:
df['failures'].value_counts()

failures
0    312
1     50
2     17
3     16
Name: count, dtype: int64

In [107]:
df = one_hot_encoder( df, categorical_cols )

  df_prep[column] = df_prep[column].replace({unique_values[i]: i})
  df_prep[column] = df_prep[column].replace({unique_values[i]: i})
  df_prep[column] = df_prep[column].replace({unique_values[i]: i})
  df_prep[column] = df_prep[column].replace({unique_values[i]: i})


In [108]:
df


Unnamed: 0,studytime,failures,freetime,goout,health,G2,G3,Alcohol,sex,activities,internet,absences
0,2,0,3,4,3,20.588235,30.0,1.0,0,0,0,0
1,2,0,3,3,3,17.647059,30.0,1.0,0,0,1,0
2,2,3,3,2,3,32.352941,50.0,2.5,0,0,1,1
3,3,0,2,2,5,73.529412,75.0,1.0,0,1,1,0
4,2,0,3,2,5,35.294118,50.0,1.5,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
390,2,2,5,4,4,41.176471,45.0,4.5,1,0,0,1
391,1,0,4,5,2,76.470588,80.0,3.5,1,0,1,0
392,1,3,5,3,3,41.176471,35.0,3.0,1,0,0,0
393,1,0,4,1,5,55.882353,50.0,3.5,1,0,1,0


In [109]:
# хотел сделать оценки А Б и С в зависимости от грейд

In [110]:
X = df.copy()
y = X.pop('G3')


X_train, X_test, y_train, y_test = tts(X,y, test_size = 0.2)

In [111]:
cat = CatBoostRegressor()
cat.fit(X_train, y_train)
cat.score(X_test, y_test)

Learning rate set to 0.03413
0:	learn: 22.6707045	total: 427us	remaining: 427ms
1:	learn: 22.2686328	total: 745us	remaining: 372ms
2:	learn: 21.8895108	total: 1ms	remaining: 333ms
3:	learn: 21.5259239	total: 1.15ms	remaining: 287ms
4:	learn: 21.1637709	total: 1.34ms	remaining: 266ms
5:	learn: 20.8336537	total: 1.48ms	remaining: 245ms
6:	learn: 20.4563522	total: 1.62ms	remaining: 230ms
7:	learn: 20.0532443	total: 1.81ms	remaining: 225ms
8:	learn: 19.7358930	total: 1.96ms	remaining: 215ms
9:	learn: 19.3521840	total: 2.03ms	remaining: 201ms
10:	learn: 19.0130671	total: 2.25ms	remaining: 203ms
11:	learn: 18.7036368	total: 2.52ms	remaining: 207ms
12:	learn: 18.4274003	total: 2.75ms	remaining: 209ms
13:	learn: 18.1846015	total: 3.26ms	remaining: 229ms
14:	learn: 17.8953015	total: 3.47ms	remaining: 228ms
15:	learn: 17.6183953	total: 3.61ms	remaining: 222ms
16:	learn: 17.2973806	total: 3.83ms	remaining: 222ms
17:	learn: 17.0362826	total: 4.01ms	remaining: 219ms
18:	learn: 16.7863387	total: 4.1

0.6567850596428012

In [112]:
pred = cat.predict(X_test.head(1))

In [113]:
grades = np.where(pred < 60, 'Congrats! You got an F',
         np.where(pred < 70, 'Ok. C',
         np.where(pred < 85, 'Harosh! B','UAYAAA KRASAVAAA! A')))

# Print the categorized grades
print(grades)

['Harosh! B']


In [114]:
importances = cat.feature_importances_

# Create a DataFrame for better visualization
feature_importances = pd.DataFrame({'feature': X.columns, 'importance': importances})
feature_importances = feature_importances.sort_values(by='importance', ascending=False)
print(feature_importances)


       feature  importance
5           G2   62.814229
4       health    6.262428
1     failures    5.447908
2     freetime    5.343862
6      Alcohol    5.204972
3        goout    4.694200
0    studytime    4.123130
8   activities    2.487583
10    absences    1.713346
9     internet    1.035549
7          sex    0.872795


In [115]:
import pandas as pd
import numpy as np

# Define the column names and their corresponding input ranges
columns = {
    'studytime': (1, 4),
    'failures': (0, 3),
    'freetime': (1, 5),
    'goout': (1, 5),
    'health': (1, 5),
    'G2': (0.0, 100.0),   # Assuming these are already scaled to [0, 100]
    'Alcohol': (0.0, 5.0), # Assuming a scale for Alcohol usage
    'sex': (0, 1),
    # 'address': (0, 1),
    'activities': (0, 1),
    'internet': (0, 1),
    'absences': (0, 1)  # Assuming absences are on a scale of 0 to 100
}

# Initialize a dictionary to store the data
data = {col: [] for col in columns}

# Number of rows you want to input
num_rows = int(input("Enter the number of rows you want to input: "))

# Function to validate input
def get_valid_input(col, min_val, max_val, is_float=False):
    while True:
        try:
            if is_float:
                value = float(input(f"Enter value for {col} ({min_val} to {max_val}): "))
            else:
                value = int(input(f"Enter value for {col} ({min_val} to {max_val}): "))
            
            if min_val <= value <= max_val:
                return value
            else:
                print(f"Value for {col} must be between {min_val} and {max_val}.")
        except ValueError:
            print(f"Invalid input for {col}. Please enter a {'float' if is_float else 'integer'} value.")

# Collecting data from user inputs
for i in range(num_rows):
    print(f"\nEntering data for row {i+1}:")
    for col, (min_val4, max_val) in columns.items():
        is_float = col in ['G2', 'G3', 'Alcohol']
        value = get_valid_input(col, min_val, max_val, is_float)
        data[col].append(value)

# Create the DataFrame
dt = pd.DataFrame(data)

# Display the DataFrame
print("\nCreated DataFrame:")
print(dt)


Entering data for row 1:

Created DataFrame:
   studytime  failures  freetime  goout  health    G2  Alcohol  sex  \
0          0         3         5      5       0  60.0      5.0    0   

   activities  internet  absences  
0           0         0         1  


In [116]:
pred = cat.predict(dt.head(1))
pred

array([57.47082377])

In [117]:
grades = np.where(pred < 60, 'Congrats! You got an F',
         np.where(pred < 70, 'Ok. C',
         np.where(pred < 85, 'Harosh! B','UAYAAA KRASAVAAA! A')))

# Print the categorized grades
print(grades)

['Congrats! You got an F']
