In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingRegressor
from sklearn import metrics
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
import plotly.express as px

pd.options.display.max_rows = 120
pd.options.display.max_columns = 120

### Анализ и подготовка данных 

In [2]:
data_fd = pd.read_csv('gladiator_data.csv')
target = data_fd['Survived'].replace({False: 0, True: 1})
data_fd = data_fd.drop(["Name", "Survived"], axis=1)

data_fd.info()

FileNotFoundError: [Errno 2] No such file or directory: 'gladiator_data.csv'

In [None]:
for f in data_fd.columns:
    series = data_fd[f].value_counts(dropna=False, normalize=True)
    print(series)
    if len(series) < 40:
        series.plot(kind='bar')
        plt.show()

In [None]:
data_types = data_fd.dtypes
data_types = data_types[data_types == "object"].index
categorial_features = list(data_types)
categorial_features

## Задание 1-2 :

In [None]:
data_fd_encoding = data_fd.copy()
data_fd_label = data_fd.copy()
data_fd_target = data_fd.copy()

#### LabelEncoding

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_classif, SelectKBest
label_encoder = LabelEncoder()

for column in categorial_features:
  data_fd_label[column] = label_encoder.fit_transform(data_fd_label[column])

print(data_fd_label[categorial_features].info())

#### OneHotEncoder

In [None]:
from sklearn.preprocessing import OneHotEncoder 
oneHotEncoder = OneHotEncoder(sparse=False)

list_to_encode = categorial_features
print(data_fd_encoding.columns)


for category in list_to_encode:
    transorm = oneHotEncoder.fit_transform(data_fd_encoding[[category]])
    encoded_df = pd.DataFrame(transorm, columns=[category for category in oneHotEncoder.get_feature_names_out([category])]).fillna(0.0)
    data_fd_encoding = data_fd_encoding.join(encoded_df)
    data_fd_encoding = data_fd_encoding.drop([category], axis=1)


## Target encoder

In [None]:
from sklearn.preprocessing import TargetEncoder
list_to_encode = categorial_features

target_encoder = TargetEncoder()

data_fd_target = target_encoder.fit_transform(data_fd_target, target)

##### Разделение на тестовую и тренировочную часть

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data_fd, target, test_size=0.2, random_state=1)
X_train_encoded, X_test_encoded, y_train, y_test = train_test_split(data_fd_encoding, target, test_size=0.2, random_state=1)
X_train_target, X_test_target, y_train, y_test = train_test_split(data_fd_target, target, test_size=0.2, random_state=1)

##### Обучение моделей без категориальных данных

In [None]:
X_train_linear = X_train.drop(categorial_features, axis=1)
X_test_linear = X_test.drop(categorial_features, axis=1)

In [None]:

linear = LogisticRegression(max_iter=10000)
linear.fit(X_train_linear, y_train)
predict = linear.predict(X_test_linear)
print(metrics.accuracy_score(predict, y_test))
#0.7464444052767872 9s

In [None]:
rand_forest = RandomForestClassifier()
rand_forest.fit(X_train_linear, y_train)
predict = rand_forest.predict(X_test_linear)
print(metrics.accuracy_score(predict, y_test))
#0.827216399799341 12m30s


In [None]:
catboost = CatBoostClassifier()
catboost.fit(X_train_linear, y_train)
predict = catboost.predict(X_test_linear)
print(metrics.accuracy_score(predict, y_test))
#0.8327616361836843     1m35s

##### CatBoostClassifier на первоначальных данных

In [None]:
catboost = CatBoostClassifier()
catboost.fit(X_train, y_train, cat_features = list(categorial_features))
predict = catboost.predict(X_test)
print(metrics.accuracy_score(predict, y_test))
# 0.850251501552395     21m30s


##### Обучение моделей на данных, обработанных OneHotEncoder

In [None]:
linear2 = LogisticRegression(max_iter=10000)
print(X_train_encoded)
linear2.fit(X_train_encoded, y_train)
predict = linear2.predict(X_test_encoded)
print(metrics.accuracy_score(predict, y_test))
# 0.7920332985343764    4m20s

In [None]:
rand_forest = RandomForestClassifier()
rand_forest.fit(X_train_encoded, y_train)
predict = rand_forest.predict(X_test_encoded)
print(metrics.accuracy_score(predict, y_test))
# 0.8407676559512995    9m30s

In [None]:
catboost = CatBoostClassifier()
catboost.fit(X_train_encoded, y_train)
predict = catboost.predict(X_test_encoded)
print(metrics.accuracy_score(predict, y_test))
# 0.8504209769920144 1m40s

##### Обучение моделей на данных, обработанных TargetEncoder

In [None]:
linear2 = LogisticRegression(max_iter=10000)
linear2.fit(X_train_target, y_train)
predict = linear2.predict(X_test_target)
print(metrics.accuracy_score(predict, y_test))
# 0.7671949781037732 12s

In [None]:
rand_forest = RandomForestClassifier()
rand_forest.fit(X_train_target, y_train)
predict = rand_forest.predict(X_test_target)
print(metrics.accuracy_score(predict, y_test))
# 0.7933823230337459 20m40s

## Задание 3

In [None]:
mi_scores = mutual_info_classif(data_fd_label, target)

In [None]:

mi_df = pd.DataFrame({'Feature': data_fd_label.columns, 'Mi score': mi_scores})
mi_df.sort_values(by='Mi score', ascending=False)

mi_df[mi_df['Mi score'] > 0.03].sort_values(by='Mi score', ascending=False)

## Задание 4

In [None]:
X_norm = (data_fd_label - data_fd_label.mean(axis=0))/data_fd_label.std(axis=0)

pca = PCA(n_components=2)
components = pca.fit_transform(X_norm)

total_var = pca.explained_variance_ratio_.sum() * 100

fig = px.scatter(
    components, x=0, y=1, color=target,
    title=f'Total Explained Variance: {total_var:.2f}%'
)
fig.show()

In [None]:
random_indices = np.random.choice(len(components), size=1000, replace=False)
random_components = components[random_indices]
random_target = target[random_indices]

fig = px.scatter(
    random_components, x=0, y=1,color=random_target,
    title=f'Total Explained Variance: {total_var:.2f}%'
)
fig.show()

In [None]:
pca.explained_variance_ratio_

## Задание 5
##### Собственная реализация сингулярного разложения через степенной метод

In [None]:
# from numba import njit

# @cuda.njit
def svd_power_iteration(A, k, epsilon=0.00001):
    #http://mlwiki.org/index.php/Power_Iteration
    n_orig, m_orig = A.shape
    if k is None:
        k=min(n_orig,m_orig)
    A_orig=A.copy()
    if n_orig > m_orig:
        A = A.T @ A
        n, m = A.shape
    elif n_orig < m_orig:
        A = A @ A.T
        n, m = A.shape
    else:
        n,m=n_orig, m_orig
        
    Q = np.random.rand(n, k)
    Q, _ = np.linalg.qr(Q)
    Q_prev = Q

    for i in range(k):
        Z = A @ Q
        Q, R = np.linalg.qr(Z)
        err = ((Q - Q_prev) ** 2).sum()
        Q_prev = Q
        if err < epsilon:
            break
            
    singular_values=np.sqrt(np.absolute(np.diag(R))) 
    if n_orig < m_orig: 
        left_vecs=Q.T
        right_vecs=np.linalg.inv(np.diag(singular_values))@left_vecs.T@A_orig
    elif n_orig==m_orig:
        left_vecs=Q.T
        right_vecs=left_vecs
        singular_values=np.square(singular_values)
    else:
        right_vecs=Q.T
        left_vecs=A_orig@ right_vecs.T @np.linalg.inv(np.diag(singular_values))

    return left_vecs, singular_values, right_vecs

##### Нахождение собственных значений и векторов

In [None]:
cov_matrix = np.cov(X_norm.T)
eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)
eig_pairs = [(eigenvalues[index],eigenvectors[:,index]) for index in range(len(eigenvalues))]

eig_pairs.sort()

eig_pairs.reverse()

eigenvalues_sorted = [eig_pairs[index][0] for index in range(len(eigenvalues))]
eigenvectors_sorted = [eig_pairs[index][1] for index in range(len(eigenvalues))]

print("Sorted eigan values:", eigenvalues_sorted)

#### Получение собственных векторов и значений через: np.linalg.svd, svd_power_iteration, которые равны значениям из предыдущей ячейки

In [None]:
u, s, v = np.linalg.svd(cov_matrix, full_matrices=False)
left_s, values, rigth_s = svd_power_iteration(cov_matrix, k=10000)
print('++++')
print(((s)))
print('++++')
print((values))


In [None]:
total = sum(eigenvalues_sorted)
var_explained = [(i/total) for i in eigenvalues_sorted]

cum_var_exp = np.cumsum(var_explained)
vect = np.array(eigenvectors_sorted) 

X_vect_pca = np.dot(X_norm,vect.T)

X_pca = X_vect_pca[:, :2]
total_var = (cum_var_exp[1]) * 100

fig = px.scatter(
    X_pca, x=0, y=1, color=target,
    title=f'Total Explained Variance: {total_var:.2f}%'
)
fig.show()

In [None]:

X_norm = (data_fd_encoding - data_fd_encoding.mean(axis=0))/data_fd_encoding.std(axis=0)

pca = PCA(n_components=2)
components = pca.fit_transform(X_norm)

total_var = pca.explained_variance_ratio_.sum() * 100

fig = px.scatter(
    components, x=0, y=1, 
    title=f'Total Explained Variance: {total_var:.2f}%'
)
fig.show()

In [None]:
import pandas as pd
import numpy as np

lst = [['CatBoostClassifier', 0.850251501552395, 0.8327616361836843, 0.8504209769920144, np.nan], ['Logistic', np.nan, 0.7464444052767872,0.7920332985343764 , 0.7671949781037732],
       ['RandForest', np.nan, 0.8264978239353553, 0.8407676559512995, 0.7933823230337459]]
   
df = pd.DataFrame(lst, columns =['Method', 'Source data', 'Source wothout categorial', 'OneHotEncoding', 'TargetEncoding'])
df