In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
from tqdm import tqdm

In [4]:
with open("data.csv", "r", encoding="utf-8") as f:
    content = f.read().replace(";", ",")

with open("data_convert", "w", encoding="utf-8") as f:
    content = f.write(content)

In [5]:
data = pd.read_csv('data_convert')

In [6]:
data

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.000000,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.000000,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.400000,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.000000,0,13.9,-0.3,0.79,Graduate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4419,1,1,6,9773,1,1,125.0,1,1,1,...,0,6,8,5,12.666667,0,15.5,2.8,-4.06,Graduate
4420,1,1,2,9773,1,1,120.0,105,1,1,...,0,6,6,2,11.000000,0,11.1,0.6,2.02,Dropout
4421,1,1,1,9500,1,1,154.0,1,37,37,...,0,8,9,1,13.500000,0,13.9,-0.3,0.79,Dropout
4422,1,1,1,9147,1,1,180.0,1,37,37,...,0,5,6,5,12.000000,0,9.4,-0.8,-3.12,Graduate


In [7]:
data.duplicated().sum()

np.int64(0)

In [8]:
data.dtypes.unique()

array([dtype('int64'), dtype('float64'), dtype('O')], dtype=object)

In [9]:
for column in data.columns:
    nu = data[column].isnull().sum()
    na = data[column].isna().sum()
    #print(f'Number of Null values: {data[column].isnull().sum()}')
    #print(f'Number of N/A values: {data[column].isna().sum()}')
    if nu > 0 or na > 0:
        if nu > 0:
            print(f'The column {column} has {nu} null values')
        if na > 0:
            print(f'The column {column} has {na} n/a values')

In [10]:
data['Target'].unique()

array(['Dropout', 'Graduate', 'Enrolled'], dtype=object)

In [11]:
def convert(dt):
    if dt == 'Graduate':
        return 1
    elif dt == 'Dropout' or dt == 'Enroll':
        return 0
data['Target'] = data['Target'].apply(convert)

In [None]:
def Standardlization(df):
    df = df.copy()  

    for col in df.columns:
        if col != 'area' and pd.api.types.is_numeric_dtype(df[col]):  
            mean = df[col].mean()
            std = df[col].std()

            df[col] = (df[col] - mean) / std

    return df



In [None]:
class LogisticRegression:
    def __init__(self, epochs: int, lr: float):
        self.weight = None
        self.epochs = epochs
        self.lr = lr
        self.losses = []
        self.metrics = []

    def sigmoid(self, z):
        return 1 / (1 + math.exp(-z))
    
    def loss_func(self, y, y_hat):
        return -(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)).mean()
    
    def accuracy(self, y, y_hat):
        return (1 - np.abs(y - y_hat)).mean()
    
    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weight = np.zeros(n_features)
        
        with tqdm(range(self.epochs)) as tq:
            for e in tq:
                tq.set_description(f'epochs{e + 1}')

        
                y_hat = self.predict(X)

                # Updating weights
                d_dw = 1/n_samples * (X.T @ (y - y_hat))
                self.weight -= self.lr * d_dw
                
                loss = self.loss_func(y, y_hat)
                metric = self.accuracy(y, y_hat)

                tq.set_postfix({
                    "loss": loss,
                    "metric": metric
                })

                self.losses.append(loss)
                self.metrics.append(metric)

            
    def predict(self, X):
        z = X @ self.weight
        pred = self.sigmoid(z)

        return (pred >= 0.5).astype(int)

In [None]:
data

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.000000,0,10.8,1.4,1.74,0.0
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,1.0
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.000000,0,10.8,1.4,1.74,0.0
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.400000,0,9.4,-0.8,-3.12,1.0
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.000000,0,13.9,-0.3,0.79,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4419,1,1,6,9773,1,1,125.0,1,1,1,...,0,6,8,5,12.666667,0,15.5,2.8,-4.06,1.0
4420,1,1,2,9773,1,1,120.0,105,1,1,...,0,6,6,2,11.000000,0,11.1,0.6,2.02,0.0
4421,1,1,1,9500,1,1,154.0,1,37,37,...,0,8,9,1,13.500000,0,13.9,-0.3,0.79,0.0
4422,1,1,1,9147,1,1,180.0,1,37,37,...,0,5,6,5,12.000000,0,9.4,-0.8,-3.12,1.0


In [None]:
X_train, X_test  = np.array_split(data, [int(0.8 * len(data))])

  return bound(*args, **kwds)


In [None]:
X_train

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.000000,0,10.8,1.4,1.74,0.0
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,1.0
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.000000,0,10.8,1.4,1.74,0.0
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.400000,0,9.4,-0.8,-3.12,1.0
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.000000,0,13.9,-0.3,0.79,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3534,1,39,1,9991,0,1,120.0,1,34,34,...,0,5,5,0,0.000000,0,7.6,2.6,0.32,0.0
3535,1,51,1,9991,0,39,120.0,1,1,4,...,9,13,13,9,11.333333,0,8.9,1.4,3.51,0.0
3536,1,39,1,9991,0,1,110.0,1,37,37,...,0,5,0,0,0.000000,0,12.7,3.7,-1.70,0.0
3537,1,39,2,9147,1,19,133.1,1,38,19,...,0,5,9,0,0.000000,0,10.8,1.4,1.74,0.0


In [None]:
y_train = X_train['Target']
X_train = X_train.drop(columns = 'Target')
y_test = X_test['Target']
X_test = X_test.drop(columns = 'Target')

In [None]:
model = LogisticRegression(lr=0.01, epochs=200)

In [None]:
model.fit()

TypeError: LogisticRegression.fit() missing 2 required positional arguments: 'X' and 'y'