# Some common statistical tests using the Python `pingouin` library

# Import relevant libraries

In [None]:
import pingouin as pg
import pandas as pd
import seaborn as sns
import numpy as np
# Apply seaborn's default theme so that all plots below share one look.
sns.set()

# Generate some synthetic data

In [None]:

# Seed NumPy's global RNG before the FIRST random draw so that every
# generated column (scores and incomes included) is reproducible.
np.random.seed(0)

# Study design: long format, each participant appears once per condition.
n_participants = 200
conditions = ['Condition1', 'Condition2', 'Condition3']
n_rows = n_participants * len(conditions)
participant_ids = np.arange(1, n_participants + 1)

data = {
    'participant': np.tile(participant_ids, len(conditions)),
    'condition': np.repeat(conditions, n_participants),
}

# Define non-uniform distribution for scores (probabilities sum to 1)
scores = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
score_probabilities = [0.05, 0.05, 0.1, 0.1, 0.2, 0.2, 0.1, 0.1, 0.05, 0.05]

# Generate non-uniform scores
data['score'] = np.random.choice(scores, size=n_rows, p=score_probabilities)

# Define non-uniform distribution for incomes (lower incomes are more likely)
incomes = [30000, 40000, 50000, 60000, 70000, 80000, 90000, 100000]
income_probabilities = [0.22222222, 0.19444444, 0.16666667, 0.13888889, 0.11111111, 0.08333333, 0.05555556, 0.02777778]

# Generate non-uniform incomes
data['income'] = np.random.choice(incomes, size=n_rows, p=income_probabilities)

# Create a DataFrame
df = pd.DataFrame(data)

# Add age and favourite_colour columns
df['age'] = np.random.randint(18, 65, n_rows)  # upper bound is exclusive: ages 18-64
df['favourite_colour'] = np.random.choice(['Red', 'Blue', 'Green', 'Yellow', 'Purple', 'Orange', 'Pink'], n_rows)

# Draw one gender per participant (not per row) so that a participant keeps
# the same gender across all of their conditions.
genders = np.random.choice(['Male', 'Female'], n_participants)
gender_mapping = dict(zip(participant_ids, genders))

# Assign gender based on the mapping
df['gender'] = df['participant'].map(gender_mapping)





# 1. The _t_ test

Use when assessing whether the means of two groups are significantly different

### The independent samples _t_ test

Use this when the two groups have no members in common

In [None]:
# Independent-samples comparison: gender is the between-subjects factor and
# score is the outcome being compared.
ttest_ind = pg.pairwise_tests(
    data=df,
    dv='score',
    between='gender',
)
ttest_ind

In [None]:
# Visualise the outcome analysed by the independent-samples t test in the
# previous cell: score per gender group.
sns.pointplot(x='gender', y='score', data=df)

### The related samples _t_ test

Use this when both groups have the same members

In [None]:
# Keep only the two conditions being compared (this drops Condition3).
two_conditions = ['Condition1', 'Condition2']
df_ = df[df['condition'].isin(two_conditions)]

# Related-samples comparison: condition is the within-subjects factor and
# rows are paired through the participant identifier.
ttest_rel = pg.pairwise_tests(
    data=df_,
    dv='income',
    within='condition',
    subject='participant',
)
ttest_rel

# 2. The ANOVA (analysis of variance): Comparing the means of multiple groups

Use this to compare the means of multiple groups

### The one-way ANOVA

Use this when comparing the means of multiple groups with no common participants

In [None]:
# One-way ANOVA of score with condition as the between-subjects factor.
anova = pg.anova(
    data=df,
    dv='score',
    between='condition',
)
anova

In [None]:
# Plot score for each condition, matching the one-way ANOVA in the previous cell.
sns.pointplot(data=df, x='condition', y='score')

### The repeated-measures ANOVA

Use this when comparing the means of multiple groups when participants are present in all groups

In [None]:
# Repeated-measures ANOVA of income: condition is the within-subjects factor
# and participant identifies the repeated measurements.
aov_rm = pg.rm_anova(
    data=df,
    dv='income',
    within='condition',
    subject='participant',
)
aov_rm

# 3. The linear regression

Used to evaluate the effect of continuous or categorical variables on a continuous variable

### The simple linear regression

Used to evaluate the effect of a continuous independent variable on a continuous dependent variable

In [92]:
# Simple linear regression: score is the single predictor, income the outcome.
predictor = df['score']
outcome = df['income']
reg = pg.linear_regression(X=predictor, y=outcome)
reg

Unnamed: 0,names,coef,se,T,pval,r2,adj_r2,CI[2.5%],CI[97.5%]
0,Intercept,55951.395006,2163.963702,25.855977,1.5957990000000002e-99,0.001537,-0.000133,51701.502517,60201.287495
1,score,-342.103747,356.557405,-0.959463,0.3377132,0.001537,-0.000133,-1042.360703,358.15321


In [None]:
# Draw the fitted line for the simple regression in this section (income
# regressed on score); scatter=False hides the individual points.
sns.regplot(x='score', y='income', scatter=False, data=df)

### The multiple linear regression

Used to evaluate the effect of several independent variables on a continuous dependent variable

In [None]:
# Multiple linear regression: age and score jointly predict income.
predictors = df[['age', 'score']]
mul_reg = pg.linear_regression(X=predictors, y=df['income'])
mul_reg

# 4. The $\chi^2$ test

This is used to evaluate whether two categorical variables are related.

In [None]:
# chi2_independence returns three objects: the expected frequencies, the
# observed frequencies and a table of test statistics. Unpack them so that
# `stats` holds only the test results rather than the whole tuple.
expected, observed, stats = pg.chi2_independence(df, x='gender', y='favourite_colour')
stats

In [None]:
# Cross-tabulate gender against favourite colour, then show the counts as a heatmap.
colour_counts = pd.crosstab(df['gender'], df['favourite_colour'])
sns.heatmap(colour_counts)