# 1. úloha

In [None]:
# Render the image file IMG_1782.jpg (expected next to the notebook) as this
# cell's output; the bare Image(...) expression on the last line is what
# triggers the rich display.
from IPython.display import Image
Image(filename="IMG_1782.jpg")

In [428]:
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import openpyxl
import pandas as pd
import scipy
import scipy.optimize
import scipy.stats
import statsmodels.api as sm
from scipy.special import gamma
from statsmodels.stats.outliers_influence import variance_inflation_factor

## 1.2

In [429]:
# Read the likelihood sheet: column A holds a flag, column B the observed value.
path = "Data_2024.xlsx"
wb = openpyxl.load_workbook(path)
sheet = wb["Data_věrohodnost"]

# Materialise the (flag, value) pairs once, skipping the header row.
records = list(sheet.iter_rows(min_row=2, max_col=2, values_only=True))

# Every observed value, regardless of its flag.
data = [value for _, value in records]

# Flag == 0 goes to the uncensored sample, anything else to the censored one.
uncensored_data = np.array([value for flag, value in records if flag == 0])
censored_data = np.array([value for flag, value in records if flag != 0])



In [430]:
def log_likelihood(params, uncensored_data, censored_data):
    """Negative log-likelihood of a Weibull model with censored observations.

    Uncensored observations contribute the log-density
    ``log(k/lambd) + (k-1)*log(x/lambd) - (x/lambd)**k``; censored
    observations contribute the log-survival term ``-(x/lambd)**k``.

    Parameters
    ----------
    params : sequence of two floats
        ``(k, lambd)`` -- shape and scale.
    uncensored_data : array_like
        Observations entering through the density.
    censored_data : array_like
        Observations entering through the survival function.

    Returns
    -------
    float
        The negated total log-likelihood (suitable for a minimiser), or
        ``np.inf`` when ``k`` or ``lambd`` is not strictly positive.
    """
    from scipy.special import xlogy

    k, lambd = params

    # Outside the parameter space the expression is undefined (NaN); report
    # +inf instead so an unconstrained minimiser backs away from the region.
    if not (k > 0 and lambd > 0):
        return np.inf

    uncensored_data = np.asarray(uncensored_data, dtype=float)
    censored_data = np.asarray(censored_data, dtype=float)

    scaled = uncensored_data / lambd
    # (k-1)*log(x/lambd) instead of log((x/lambd)**(k-1)): the power can
    # under/overflow and turn a finite term into +-inf. xlogy also keeps the
    # k == 1, x == 0 case at 0, matching log(0**0) == 0.
    ll_uncensored = np.sum(np.log(k / lambd) + xlogy(k - 1, scaled) - scaled**k)

    ll_censored = np.sum(-(censored_data / lambd) ** k)

    return -(ll_uncensored + ll_censored)

def log_likelihood_exp(params, uncensored_data, censored_data):
    """Negative log-likelihood of the exponential model (Weibull with k = 1).

    Parameters
    ----------
    params : float or length-1 sequence
        The scale ``lambd``; ``scipy.optimize.minimize`` passes a length-1
        array, a bare scalar is accepted as well.
    uncensored_data : array_like
        Observations contributing ``log(1/lambd) - x/lambd``.
    censored_data : array_like
        Observations contributing ``-x/lambd``.

    Returns
    -------
    float
        The negated total log-likelihood, or ``np.inf`` when ``lambd`` is
        not strictly positive.
    """
    lambd = float(np.ravel(params)[0])

    # log(1/lambd) is undefined for lambd <= 0; return +inf rather than NaN
    # so an unconstrained minimiser backs away from the region.
    if not lambd > 0:
        return np.inf

    uncensored_data = np.asarray(uncensored_data, dtype=float)
    censored_data = np.asarray(censored_data, dtype=float)

    # With k = 1 the (k-1)*log(x/lambd) term of the Weibull density vanishes.
    ll_uncensored = np.sum(np.log(1 / lambd) - uncensored_data / lambd)

    ll_censored = np.sum(-(censored_data / lambd))

    return -(ll_uncensored + ll_censored)



In [None]:
import scipy.optimize

# Starting point: shape 1.5, scale equal to the mean of all observed values.
initial_guess = [1.5, np.mean(data)]

# Minimise the negative log-likelihood over (k, lambda).
result = scipy.optimize.minimize(
    log_likelihood,
    initial_guess,
    args=(uncensored_data, censored_data),
    method='BFGS',
)

k_opt = result.x[0]
lambd_opt = result.x[1]
print(f"est. params: k = {k_opt:.4f}, lambda = {lambd_opt:.4f}")

## 1.3

In [None]:
# Fit the restricted (exponential, k = 1) model, starting from the sample mean.
initial_guess_exp = [np.mean(data)]

result_exp = scipy.optimize.minimize(fun=log_likelihood_exp, x0=initial_guess_exp, args=(uncensored_data, censored_data), method='BFGS')

# Both objectives are NEGATIVE log-likelihoods, so .fun is the minimised
# negative log-likelihood of the respective model.
L1 = result.fun

L2 = result_exp.fun

# Likelihood-ratio statistic: 2 * (loglik_full - loglik_restricted).
Stat = 2 * (L2 - L1)

print(Stat)

# The models differ by one free parameter (k), so the reference distribution
# is chi-square with 1 degree of freedom; H0 is rejected for LARGE values only.
lrt_alpha = 0.05
lrt_critical_value = scipy.stats.chi2.ppf(1 - lrt_alpha, df=1)
lrt_p_value = scipy.stats.chi2.sf(Stat, df=1)

print(f"critical value (alpha = {lrt_alpha}): {lrt_critical_value:.3f}, p-value: {lrt_p_value:.3g}")



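Doplnok kritického oboru: <0; 5.024> (5.024 je 0.975-kvantil rozdelenia χ² s 1 stupňom voľnosti; pre jednostranný test pomerom vierohodností na hladine α = 0.05 by kritická hodnota bola 3.841 – záver testu sa tým nemení)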
592.39 nepatrí do doplnku kritického oboru, teda zamietame H0: Exponenciálne rozdelenie je postačujúce
Prijímame H1: Exponenciálne rozdelenie nie je postačujúce

## 1.4

In [None]:
# Closed-form Weibull summaries evaluated at the fitted parameters:
# mean = lambda * Gamma(1 + 1/k), p-quantile = lambda * (-ln(1 - p))**(1/k).
mean = lambd_opt * gamma(1 + 1 / k_opt)
tencentil = lambd_opt * np.power(-np.log(1 - 0.1), 1 / k_opt)

print("mean:", mean)
print("tencentil:", tencentil)


# 2. úloha

In [434]:
# Load the regression sheet; InteractingPct is removed before modelling.
df = pd.read_excel(path, sheet_name='Data_regrese', usecols='A:E', engine='openpyxl')
df = df.drop(columns=["InteractingPct"])

# z-score both continuous predictors. The moments stay in module scope so
# later cells can map standardized values back to the original scale.
act_users_mean, act_users_std = df['ActiveUsers'].mean(), df['ActiveUsers'].std()
df["ActiveUsers"] = (df["ActiveUsers"] - act_users_mean) / act_users_std

scroll_mean, scroll_std = df['ScrollingPct'].mean(), df['ScrollingPct'].std()
df['ScrollingPct'] = (df['ScrollingPct'] - scroll_mean) / scroll_std

# Dummy-code OSType (first level dropped) and make every column float.
df = pd.get_dummies(df, columns=['OSType'], drop_first=True).astype(float)

numeric_predictors = ['ActiveUsers', 'ScrollingPct', 'OSType_iOS', 'OSType_Windows', 'OSType_MacOS']

# Second-order terms: a square for every predictor, then every pairwise product.
squared_terms = {f'{name}^2': df[name] ** 2 for name in numeric_predictors}
interaction_terms = {
    f'{left}:{right}': df[left] * df[right]
    for left, right in combinations(numeric_predictors, 2)
}
df = df.assign(**squared_terms, **interaction_terms)

# Squares of dummies and dummy-by-dummy products are removed again.
df = df.drop(columns=[
    'OSType_iOS:OSType_Windows',
    'OSType_iOS:OSType_MacOS',
    'OSType_Windows:OSType_MacOS',
    'OSType_Windows^2',
    'OSType_MacOS^2',
    'OSType_iOS^2',
])



In [None]:
# Response is the ping column; every other column of df is a regressor.
y = df['Ping [ms]']
X = sm.add_constant(df.drop(columns=['Ping [ms]']))

# Ordinary least squares on the full second-order design.
full_model = sm.OLS(endog=y, exog=X).fit()
print(full_model.summary())

In [436]:
# Refit after removing the ScrollingPct x OSType_iOS term from the design.
X = X.drop('ScrollingPct:OSType_iOS', axis='columns')
full_model = sm.OLS(endog=y, exog=X).fit()

In [437]:
# Refit after removing the ScrollingPct x OSType_Windows term from the design.
X = X.drop('ScrollingPct:OSType_Windows', axis='columns')
full_model = sm.OLS(endog=y, exog=X).fit()

In [438]:
# Refit after removing the ScrollingPct x OSType_MacOS term from the design.
X = X.drop('ScrollingPct:OSType_MacOS', axis='columns')
full_model = sm.OLS(endog=y, exog=X).fit()

In [None]:
# Refit after removing the squared ScrollingPct term and show the reduced model.
X = X.drop('ScrollingPct^2', axis='columns')
full_model = sm.OLS(endog=y, exog=X).fit()
print(full_model.summary())

In [None]:
# Per-observation influence measures of the current model.
influence = full_model.get_influence()
leverage = influence.hat_matrix_diag
cooks_d = influence.cooks_distance  # tuple: (distances, p-values)
standardized_residuals = influence.resid_studentized_internal
studentized_residuals = influence.resid_studentized_external

# Externally studentized residuals are computed with observation i left out,
# so their reference t distribution has n - p - 1 degrees of freedom (one
# fewer than the model's residual df). sf() avoids the cancellation in 1 - cdf().
studentized_residuals_pvalues = 2 * scipy.stats.t.sf(np.abs(studentized_residuals), df=full_model.df_resid - 1)

outl_stats_df = pd.DataFrame({
    'Leverage': leverage,
    'Standardized Residuals': standardized_residuals,
    'Studentized Residuals': studentized_residuals,
    'Studentized Residuals p-value': studentized_residuals_pvalues,
    'Cook\'s Distance': cooks_d[0],
    'Cook\'s Distance_p-value': cooks_d[1]
}, index=df.index)
# Keep only the "interesting" rows: leverage above 3p/n, |standardized
# residual| above 2, or a Cook's distance p-value below 0.05.
outl_stats_df = outl_stats_df[(outl_stats_df['Leverage'] > 3*len(full_model.params)/df.shape[0]) | (np.abs(outl_stats_df['Standardized Residuals']) > 2) | (outl_stats_df['Cook\'s Distance_p-value'] < 0.05)]

summary_frame = influence.summary_frame()

print(outl_stats_df)

# NOTE(review): the two row labels are hard-coded from a manual reading of the
# table above. Because the index is reset afterwards, running this cell a
# second time removes two DIFFERENT observations -- run it exactly once per
# kernel session (Restart & Run All).
df = df.drop(index=[255, 476])

df = df.reset_index(drop=True)

In [441]:
# Recompute the second-order columns on the frame without the removed rows:
# squares first, then all pairwise products (existing columns are overwritten
# in place, missing ones are appended).
fresh_squares = {f'{term}^2': df[term] ** 2 for term in numeric_predictors}
fresh_products = {
    f'{first}:{second}': df[first] * df[second]
    for first, second in combinations(numeric_predictors, 2)
}
df = df.assign(**fresh_squares, **fresh_products)

# Dummy squares and dummy-by-dummy products are removed again.
df = df.drop(columns=[
    'OSType_iOS:OSType_Windows',
    'OSType_iOS:OSType_MacOS',
    'OSType_Windows:OSType_MacOS',
    'OSType_Windows^2',
    'OSType_MacOS^2',
    'OSType_iOS^2',
])


In [None]:
# Rebuild response and design matrix from the frame after the row removal.
response_column = 'Ping [ms]'
y = df[response_column]
X = sm.add_constant(df.drop(columns=[response_column]))

full_model = sm.OLS(endog=y, exog=X).fit()
print(full_model.summary())



In [443]:
# Refit after removing the ScrollingPct x OSType_iOS term from the design.
X = X.drop('ScrollingPct:OSType_iOS', axis='columns')
full_model = sm.OLS(endog=y, exog=X).fit()

In [444]:
# Refit after removing the ScrollingPct x OSType_Windows term from the design.
X = X.drop('ScrollingPct:OSType_Windows', axis='columns')
full_model = sm.OLS(endog=y, exog=X).fit()

In [445]:
# Refit after removing the ScrollingPct x OSType_MacOS term from the design.
X = X.drop('ScrollingPct:OSType_MacOS', axis='columns')
full_model = sm.OLS(endog=y, exog=X).fit()

In [None]:
# Refit after removing the squared ScrollingPct term and show the final model.
X = X.drop('ScrollingPct^2', axis='columns')
full_model = sm.OLS(endog=y, exog=X).fit()
print(full_model.summary())

In [None]:
# One variance inflation factor per design-matrix column (constant included),
# computed on the raw array behind X.
design_values = X.values
vif_per_column = [
    variance_inflation_factor(design_values, position)
    for position in range(design_values.shape[1])
]
vif_data = pd.DataFrame({'Variable': X.columns, 'VIF': vif_per_column})

print(vif_data)

In [None]:
# Residuals against fitted values, with a dashed zero reference line.
fitted_values = full_model.fittedvalues
residuals = full_model.resid

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(fitted_values, residuals, alpha=0.7)
ax.axhline(0, color='red', linestyle='--', linewidth=1)

ax.set_title('Residuals vs Fitted Values')
ax.set_xlabel('Fitted Values')
ax.set_ylabel('Residuals')
plt.show()

In [None]:


# Q-Q plot of the residuals against a fitted normal, with the 45-degree line.
qq_figure = sm.qqplot(residuals, line='45', fit=True)
qq_figure.axes[0].set_title('Q-Q Plot of Residuals')
plt.show()

In [None]:
# Distribution of the residuals in 30 bins.
hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
hist_ax.hist(residuals, bins=30, edgecolor='black', alpha=0.7)
hist_ax.set_title('Histogram of Residuals')
hist_ax.set_xlabel('Residuals')
hist_ax.set_ylabel('Frequency')
plt.show()

### 2.1.1 finálna rovnica modelu

Ping = 51.2936 + 10.0457\*ActiveUsers - 5.1410\*ScrollingPct + 9.0073\*OSType_MacOS + 3.6657\*OSType_Windows - 5.7223 \* OSType_iOS - 3.0081 \* ActiveUsers^2 + 2.5596 \* ActiveUsers\*ScrollingPct - 2.7423 \* ActiveUsers \* OSType_iOS - 1.9127 \* ActiveUsers \* OSType_Windows + 4.4373 \* ActiveUsers \* OSType_MacOS

### 2.1.2 Splnenie predpokladov a základné regresné diagnostiky

- Závislosť medzi prediktormi a závislou premennou je lineárna: 
    - Hodnoty v grafe Residuals vs. Fitted values sú náhodne porozhadzované okolo nuly.
- Reziduá sú nezávislé
    - Výsledok Durbin-Watson testu je 1.99, čo neindikuje žiadnu autokoreláciu reziduí
- Homoskedasticita
    - Z grafu Residuals vs. Fitted values je vidieť, že rozloženie hodnôt je všade cca. rovnaké, teda neukazuje žiadnu významnú heteroskedasticitu
- Normálne rozloženie reziduí
    - Viď histogram reziduí a Q-Q graf vyššie
- Žiadna multikolinearita
    - VIF všetkých premenných je < 5

Počas regresného modelovania boli nájdené dve odľahlé hodnoty podľa Studentizovaných reziduí. Tieto boli odstránené, čo zvýšilo kvalitu modelu.
R-squared a Adj. R-squared vyšli pomerne vysoké čísla, čo naznačuje, že model pasuje na dáta. Hodnoty sú tiež podobné, čo naznačuje, že jednotlivé parametre správne prispievajú a v modeli neostali zbytočné premenné.

 



## 2.2 Najproblematickejšie nastavenie parametrov

In [None]:
# Position of the observation with the largest fitted ping. fittedvalues is
# the public equivalent of the private _results.predict() with no arguments.
max_ping = full_model.fittedvalues.to_numpy().argmax()

# df was re-indexed 0..n-1 before the model was fitted, so the position can be
# used directly with iloc.
worst_params = df.iloc[max_ping]

print(worst_params)

# Undo the z-scoring to report the settings on the original scale.
ActUsers = worst_params["ActiveUsers"] * act_users_std + act_users_mean
ScrlPct = worst_params["ScrollingPct"] * scroll_std + scroll_mean
# NOTE(review): assumes ScrollingPct and InteractingPct are fractions summing
# to 1 (not percentages summing to 100) -- confirm against the raw data.
IntPct = 1 - ScrlPct

# Read the OS type from the dummy columns of the selected row instead of
# hard-coding it, so the message stays correct if the data or model change.
os_dummy_columns = [
    name for name in df.columns
    if name.startswith("OSType_") and ":" not in name and "^" not in name
]
active_os = [name for name in os_dummy_columns if worst_params[name] == 1.0]
if active_os:
    worst_os_type = active_os[0][len("OSType_"):]
else:
    # All dummies are 0: the row belongs to the level dropped by drop_first.
    worst_os_type = "baseline level (dropped dummy)"

print("Najhoršie nastavenie parametrov: ActiveUsers: ", ActUsers, " InteractingPct: ", IntPct, " ScrollingPct: ", ScrlPct, "OSType: " + worst_os_type)


## 2.3 Odhad odozvy

In [None]:
# Scenario: Windows, with both continuous predictors at their sample means
# (on the standardized scale used by the model).
mean_act_users = df["ActiveUsers"].mean()
mean_scrl_pct = df["ScrollingPct"].mean()

# Build the design row from scratch. Starting from zeros guarantees that every
# term not listed here (iOS / MacOS dummies and their interactions) is 0,
# instead of inheriting whatever an arbitrary template observation holds.
scenario_terms = {
    'const': 1.0,
    'ActiveUsers': mean_act_users,
    'ScrollingPct': mean_scrl_pct,
    'OSType_Windows': 1.0,
    'ActiveUsers^2': mean_act_users ** 2,
    'ActiveUsers:ScrollingPct': mean_act_users * mean_scrl_pct,
    'ActiveUsers:OSType_Windows': mean_act_users,
}
missing_terms = [name for name in scenario_terms if name not in X.columns]
if missing_terms:
    # A silently added column would misalign the row with the fitted model.
    raise KeyError(f"scenario terms not present in the design matrix: {missing_terms}")

row = pd.DataFrame(0.0, index=[0], columns=X.columns)
for term_name, term_value in scenario_terms.items():
    row.loc[0, term_name] = term_value

prediction = full_model.predict(row)

print("Odhadovaná odozva: ", prediction.iloc[0], "ms")

# conf_int's first positional parameter is `obs`, not `alpha`; pass both by
# keyword so this really is the confidence interval of the MEAN response.
conf_int = full_model.get_prediction(row).conf_int(obs=False, alpha=0.05)
print ("CI: ",conf_int)

t_stat = scipy.stats.t.ppf(1 - 0.025, df=full_model.df_resid)

cov_matrix = full_model.cov_params()
X_new = row.to_numpy()

# Variance of a NEW observation = variance of the estimated mean
# (x' Cov(beta) x) + residual variance. Without the second term the interval
# is just the confidence interval again.
var_mean = (X_new @ cov_matrix.to_numpy() @ X_new.T)[0, 0]
se_prediction = np.sqrt(var_mean + full_model.mse_resid)

pi_upper = prediction.iloc[0] + t_stat * se_prediction
pi_lower = prediction.iloc[0] - t_stat * se_prediction
print("PI: ", pi_lower, pi_upper)

## 2.4 Zhodnotenie

p-hodnoty prediktorov sú nízke, R-squared aj adj. R-squared naznačujú, že model dobre popisuje dáta, a VIF je < 5 pre všetky prediktory – z toho mi vyplýva, že model je použiteľný.