# 1. úloha

## 1.1
TODO: pridať zvolenú parametrizáciu Weibullovho rozdelenia, log-vierohodnostnú funkciu a jej parciálne derivácie.

In [118]:
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import openpyxl
import pandas as pd
import scipy
import scipy.optimize
import scipy.stats
import statsmodels.api as sm
from scipy.special import gamma
from statsmodels.stats.outliers_influence import variance_inflation_factor

## 1.2

In [119]:
# Load the likelihood sheet and split the observations by the flag in column A.
path = "Data_2024.xlsx"
wb = openpyxl.load_workbook(path)
sheet = wb["Data_věrohodnost"]

# Each record is (flag, value); the header row is skipped via min_row=2.
records = list(sheet.iter_rows(min_row=2, max_col=2, values_only=True))

# All observed values, regardless of the flag (used for initial guesses).
data = [value for _, value in records]

# Flag == 0 goes to the uncensored sample; anything else is treated as censored.
uncensored_data = np.array([value for flag, value in records if flag == 0])
censored_data = np.array([value for flag, value in records if not (flag == 0)])



In [120]:
def log_likelihood(params, uncensored_data, censored_data):
    """Return the NEGATIVE Weibull log-likelihood of a censored sample.

    Despite its name, the function returns ``-(log-likelihood)`` so that it
    can be passed directly to a minimizer.

    Parameters
    ----------
    params : sequence of two floats
        ``(k, lambd)`` -- Weibull shape and scale.
    uncensored_data : array-like
        Fully observed values; each contributes the log-density
        ``log(k/lambd) + (k-1)*log(x/lambd) - (x/lambd)**k``.
    censored_data : array-like
        Censored values; each contributes ``-(x/lambd)**k``, i.e. the log of
        the Weibull survival function.

    Returns
    -------
    float
        Negative log-likelihood at ``params``.
    """
    # Local import: the notebook's import cell only pulls in ``gamma``.
    from scipy.special import xlogy

    k, lambd = params
    observed = np.asarray(uncensored_data, dtype=float)
    censored = np.asarray(censored_data, dtype=float)

    # xlogy(k-1, x/lambd) == (k-1)*log(x/lambd), but unlike
    # log((x/lambd)**(k-1)) it does not underflow to log(0) = -inf for small
    # ratios, and it still yields 0 when k == 1 (matching 0**0 == 1).
    ll_uncensored = np.sum(
        np.log(k / lambd) + xlogy(k - 1, observed / lambd) - (observed / lambd) ** k
    )

    ll_censored = np.sum(-(censored / lambd) ** k)

    return -(ll_uncensored + ll_censored)

def log_likelihood_exp(params, uncensored_data, censored_data):
    """Return the NEGATIVE exponential log-likelihood of a censored sample.

    This is the Weibull model with the shape fixed at ``k = 1``. Like
    ``log_likelihood`` it returns ``-(log-likelihood)`` so it can be minimized.

    Parameters
    ----------
    params : float or one-element sequence/array
        The scale ``lambd``. Optimizers pass a one-element array; a plain
        scalar or list is accepted as well.
    uncensored_data : array-like
        Fully observed values; each contributes ``-log(lambd) - x/lambd``.
    censored_data : array-like
        Censored values; each contributes ``-x/lambd``.

    Returns
    -------
    float
        Negative log-likelihood at ``lambd``.
    """
    # Accept a scalar, list, or ndarray; keep a NumPy scalar so that invalid
    # values give nan/inf (with a warning) instead of raising.
    lambd = np.asarray(params, dtype=float).ravel()[0]
    observed = np.asarray(uncensored_data, dtype=float)
    censored = np.asarray(censored_data, dtype=float)

    # With k = 1 the (k-1)*log(x/lambd) term of the Weibull density vanishes.
    ll_uncensored = np.sum(-np.log(lambd) - observed / lambd)

    ll_censored = np.sum(-censored / lambd)

    return -(ll_uncensored + ll_censored)



In [None]:
# Maximum-likelihood fit of the Weibull model (minimizes the negative
# log-likelihood). Start from shape 1.5 and scale equal to the mean of all
# observations.
initial_guess = [1.5, np.mean(data)]

result = scipy.optimize.minimize(
    fun=log_likelihood,
    x0=initial_guess,
    args=(uncensored_data, censored_data),
    method='BFGS',
)

# BFGS is unconstrained, so it can wander to non-positive k/lambda or stop
# early; surface that instead of silently reporting the last iterate.
if not result.success:
    print(f"warning: optimizer did not report convergence: {result.message}")

k_opt, lambd_opt = result.x
print(f"est. params: k = {k_opt:.4f}, lambda = {lambd_opt:.4f}")

## 1.3

In [None]:
# Fit the exponential submodel (Weibull with k fixed at 1), starting the
# scale at the mean of all observations.
initial_guess_exp = [np.mean(data)]

result_exp = scipy.optimize.minimize(
    log_likelihood_exp,
    initial_guess_exp,
    args=(uncensored_data, censored_data),
    method='BFGS',
)

# Minimized negative log-likelihoods: L1 for the Weibull fit, L2 for the
# exponential fit.
L1, L2 = result.fun, result_exp.fun

# Likelihood-ratio statistic: twice the gap between the two minimized
# negative log-likelihoods.
Stat = 2 * (L2 - L1)

print(Stat)



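Za platnosti H0 má testová štatistika asymptoticky rozdelenie χ² s 1 stupňom voľnosti (exponenciálne rozdelenie je Weibullovo rozdelenie s k = 1, teda o jeden parameter menej).
Doplnok kritického oboru: <0; 5.024> (hranica 5.024 je kvantil χ²₀.₉₇₅(1), čo zodpovedá hladine významnosti α = 0,025; pre α = 0,05 by hranica bola 3.841).
Hodnota 592.39 nepatrí do doplnku kritického oboru, teda zamietame H0: exponenciálne rozdelenie je postačujúce.
Prijímame H1: exponenciálne rozdelenie nie je postačujúce.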

## 1.4

In [None]:
# Weibull mean: lambda * Gamma(1 + 1/k).
mean = lambd_opt * gamma(1 + 1 / k_opt)

# 10th percentile from the Weibull quantile function
# lambda * (-ln(1 - p))**(1/k) with p = 0.1.
tencentil = lambd_opt * np.power(-np.log(1 - 0.1), 1 / k_opt)

print("mean:", mean)
print("tencentil:", tencentil)


# 2. úloha

In [124]:
# Load the regression sheet (columns A:E) and discard InteractingPct.
df = pd.read_excel(path, sheet_name='Data_regrese', usecols='A:E', engine='openpyxl')
df = df.drop(columns="InteractingPct")

# Standardize both continuous predictors (z-score with the sample std).
for col in ('ActiveUsers', 'ScrollingPct'):
    x_mean = df[col].mean()
    x_std = df[col].std()
    df[col] = (df[col] - x_mean) / x_std

# One-hot encode OSType (first level dropped) and make everything float.
df = pd.get_dummies(df, columns=['OSType'], drop_first=True).astype(float)

numeric_predictors = ['ActiveUsers', 'ScrollingPct', 'OSType_iOS', 'OSType_Windows', 'OSType_MacOS']
continuous_predictors = numeric_predictors[:2]

# Squared terms only for the continuous predictors.
for col in continuous_predictors:
    df[f'{col}^2'] = df[col] ** 2

# Pairwise interactions, skipping products of two OSType dummies.
for col1, col2 in combinations(numeric_predictors, 2):
    if col1 in continuous_predictors or col2 in continuous_predictors:
        df[f'{col1}:{col2}'] = df[col1] * df[col2]


In [None]:
# Response is the ping; every other column of df is a predictor, plus an
# explicit intercept column.
y = df['Ping [ms]']
X = sm.add_constant(df.drop(columns=['Ping [ms]']))

# Full quadratic model with all candidate terms.
full_model = sm.OLS(endog=y, exog=X).fit()
print(full_model.summary())

In [126]:
# Elimination step: remove the ScrollingPct x iOS interaction and refit
# (presumably chosen from the previous summary -- confirm).
X = X.drop('ScrollingPct:OSType_iOS', axis=1)
full_model = sm.OLS(endog=y, exog=X).fit()

In [127]:
# Elimination step: remove the ScrollingPct x Windows interaction and refit
# (presumably chosen from the previous summary -- confirm).
X = X.drop('ScrollingPct:OSType_Windows', axis=1)
full_model = sm.OLS(endog=y, exog=X).fit()

In [128]:
# Elimination step: remove the ScrollingPct x MacOS interaction and refit
# (presumably chosen from the previous summary -- confirm).
X = X.drop('ScrollingPct:OSType_MacOS', axis=1)
full_model = sm.OLS(endog=y, exog=X).fit()

In [None]:
# Elimination step: remove the squared ScrollingPct term, refit, and show
# the summary of the reduced model.
X = X.drop('ScrollingPct^2', axis=1)
full_model = sm.OLS(endog=y, exog=X).fit()
print(full_model.summary())

In [None]:
# Variance inflation factor for every column of the design matrix
# (the intercept column is included in the table as well).
design_matrix = X.values
vif_values = [
    variance_inflation_factor(design_matrix, column_idx)
    for column_idx in range(X.shape[1])
]

vif_data = pd.DataFrame({'Variable': X.columns, 'VIF': vif_values})

print(vif_data)

In [None]:
# Per-observation influence diagnostics for the current model.
influence = full_model.get_influence()
leverage = influence.hat_matrix_diag
cooks_d = influence.cooks_distance  # tuple: (distances, p-values)
standardized_residuals = influence.resid_studentized_internal
studentized_residuals = influence.resid_studentized_external

# Externally studentized residuals leave the observation out of the variance
# estimate, so their reference distribution is t with n - p - 1 degrees of
# freedom (not n - p). sf() is used instead of 1 - cdf() for tail accuracy.
n_obs = df.shape[0]
n_params = len(full_model.params)
studentized_residuals_pvalues = 2 * scipy.stats.t.sf(
    np.abs(studentized_residuals), df=n_obs - n_params - 1
)

outl_stats_df = pd.DataFrame({
    'Leverage': leverage,
    'Standardized Residuals': standardized_residuals,
    'Studentized Residuals': studentized_residuals,
    'Studentized Residuals p-value': studentized_residuals_pvalues,
    'Cook\'s Distance': cooks_d[0],
    'Cook\'s Distance_p-value': cooks_d[1]
}, index=df.index)

# Keep only the "interesting" rows: high leverage (> 3p/n), a large
# standardized residual (|r| > 2), or a Cook's distance p-value below 0.05.
outl_stats_df = outl_stats_df[
    (outl_stats_df['Leverage'] > 3 * n_params / n_obs)
    | (np.abs(outl_stats_df['Standardized Residuals']) > 2)
    | (outl_stats_df['Cook\'s Distance_p-value'] < 0.05)
]

# Full statsmodels influence table; not used by this cell's output.
summary_frame = influence.summary_frame()

print(outl_stats_df)



In [None]:
# Observations flagged by the influence diagnostics.
outlier_index = [255, 476]

# Remove the outliers from the design matrix and the response as well as from
# df: the model is fitted on X and y, so dropping rows from df alone would
# refit exactly the same model. errors="ignore" keeps the cell re-runnable.
df = df.drop(index=outlier_index, errors="ignore")
X = X.drop(index=outlier_index, errors="ignore")
y = y.drop(index=outlier_index, errors="ignore")

# The squared and interaction columns are row-wise functions of the
# predictors, so they stay valid after dropping rows; X already holds the
# terms selected by the elimination steps.
full_model = sm.OLS(y, X).fit()
print(full_model.summary())

TODO: zapísať rovnicu finálneho modelu.