In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from scipy.stats import jarque_bera, normaltest, ttest_1samp
from statsmodels.stats.stattools import durbin_watson

# Load your dataset from a CSV file
csv_file_path = "/content/drive/MyDrive/Dataset/Final Dataset Fifa/final-gk.csv"

# Load the CSV file into a DataFrame
df = pd.read_csv(csv_file_path)

# Print the column names to verify their existence
print(df.columns)


Index(['Age', 'Potential', 'Value', 'GKDiving', 'GKHandling', 'GKKicking',
       'GKPositioning', 'GKReflexes'],
      dtype='object')


In [None]:
# List of numerical columns to perform tests on
numerical_columns = ['Age', 'Potential', 'Value', 'GKDiving', 'GKHandling', 'GKKicking',
       'GKPositioning', 'GKReflexes']

# Iterate through each numerical column
for col in numerical_columns:
    data = df[col]

    # JB test
    jb_stat, jb_p_value = jarque_bera(data)
    print(f'Jarque-Bera Test for {col}:')
    print(f'  JB Test Statistic: {jb_stat}')
    print(f'  JB P-Value: {jb_p_value}')

    # P-value test
    p_value_stat, p_value = normaltest(data)
    print(f'P-Value Test for {col}:')
    print(f'  P-Value Test Statistic: {p_value_stat}')
    print(f'  P-Value: {p_value}')

    # T-test (one-sample T-test against a known population mean)
    population_mean = 0  # Replace with your known population mean
    t_stat, t_p_value = ttest_1samp(data, population_mean)
    print(f'T-Test for {col}:')
    print(f'  T-Test Statistic: {t_stat}')
    print(f'  T-Test P-Value: {t_p_value}')

    # Standard error
    std_error = np.std(data, ddof=1) / np.sqrt(len(data))
    print(f'Standard Error for {col}: {std_error}')

    # DW test
    # Assuming your data is time series data, you can perform the DW test
    dw_stat = durbin_watson(data)
    print(f'Durbin-Watson Test for {col}:')
    print(f'  Durbin-Watson Statistic: {dw_stat}')


Jarque-Bera Test for Age:
  JB Test Statistic: 35.957394898804104
  JB P-Value: 1.5557897500496247e-08
P-Value Test for Age:
  P-Value Test Statistic: 79.63206820429384
  P-Value: 5.106414894298702e-18
T-Test for Age:
  T-Test Statistic: 177.11989523402542
  T-Test P-Value: 0.0
Standard Error for Age: 0.15840364716058716
Durbin-Watson Test for Age:
  Durbin-Watson Statistic: 0.048702429066626876
Jarque-Bera Test for Potential:
  JB Test Statistic: 14.995170510125233
  JB P-Value: 0.0005544215416448926
P-Value Test for Potential:
  P-Value Test Statistic: 14.777059933955874
  P-Value: 0.0006183042133713634
T-Test for Potential:
  T-Test Statistic: 407.85342524826075
  T-Test P-Value: 0.0
Standard Error for Potential: 0.17699021891372035
Durbin-Watson Test for Potential:
  Durbin-Watson Statistic: 0.00980631926938963
Jarque-Bera Test for Value:
  JB Test Statistic: 306423.83330592926
  JB P-Value: 0.0
P-Value Test for Value:
  P-Value Test Statistic: 1583.6033341258951
  P-Value: 0.0
T-T

In [None]:
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Load your dataset (replace 'your_dataset.csv' with the actual file path)
data = pd.read_csv('final-gk.csv')

# Extract the predictor variables (independent variables) for which you want to check multicollinearity
# Replace 'X1', 'X2', 'X3', etc. with the actual column names of your predictor variables
X = data[['Age', 'Potential', 'GKDiving', 'GKHandling', 'GKKicking',
       'GKPositioning', 'GKReflexes', 'Value']]

# Add a constant (intercept) term to the predictor variables
X['intercept'] = 1

# Calculate VIF values
vif = pd.DataFrame()
vif["Variable"] = X.columns
vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# Display the VIF values
print(vif)

        Variable         VIF
0            Age    3.359302
1      Potential    6.387806
2       GKDiving    8.109060
3     GKHandling    6.120214
4      GKKicking    2.116846
5  GKPositioning    7.709050
6     GKReflexes    7.779453
7      intercept  276.006900


In [1]:
import pandas as pd
import numpy as np
from scipy.stats import jarque_bera, normaltest, ttest_1samp
from statsmodels.stats.stattools import durbin_watson

# Load the dataset
df = pd.read_csv('final-gk.csv')

# List of numerical columns to perform tests on
numerical_columns = ['Age', 'Potential', 'Value', 'GKDiving', 'GKHandling', 'GKKicking',
                     'GKPositioning', 'GKReflexes']

# Iterate through each numerical column
for col in numerical_columns:
    data = df[col]

    # Mean
    mean_value = np.mean(data)
    print(f'Mean for {col}: {mean_value}')

    # Standard Deviation
    std_dev = np.std(data, ddof=1)
    print(f'Standard Deviation for {col}: {std_dev}')

Mean for Age: 28.056437389770725
Standard Deviation for Age: 5.334229588413096
Mean for Potential: 72.18606701940035
Standard Deviation for Potential: 5.960130839867328
Mean for Value: 3141962.0811287477
Standard Deviation for Value: 8923320.014371255
Mean for GKDiving: 69.05908289241623
Standard Deviation for GKDiving: 6.575452588211171
Mean for GKHandling: 66.59876543209876
Standard Deviation for GKHandling: 6.384864485855628
Mean for GKKicking: 65.09435626102292
Standard Deviation for GKKicking: 6.8699239548697575
Mean for GKPositioning: 67.69929453262786
Standard Deviation for GKPositioning: 7.202680402049936
Mean for GKReflexes: 70.19488536155202
Standard Deviation for GKReflexes: 6.951472843937273
