In [20]:
import numpy as np
import statsmodels.api as sm

import pandas as pd
#https://stats.oarc.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/#DEVIATION
# Download the hsb2 sample dataset (CSV hosted by UCLA) into a DataFrame.
url = "https://stats.idre.ucla.edu/stat/data/hsb2.csv"
# The file is comma-separated, so read_csv (default separator ",") replaces
# read_table plus an explicit delimiter.
hsb2 = pd.read_csv(url)

# Only the final expression of a cell is echoed; bare expressions in the
# middle of a cell are evaluated and thrown away. Print the previews so they
# actually appear in the output.
print(hsb2.head(10))
# Mean `write` score for each `race` code.
print(hsb2.groupby("race")["write"].mean())


from patsy.contrasts import Treatment

# Category codes handed to the patsy contrast coders in this notebook
# (presumably the four `race` codes in hsb2 -- confirm against the data).
levels = list(range(1, 5))
 
# ### Sum (Deviation) Coding

# Sum coding compares the mean of the dependent variable for a given level
# to the overall mean of the dependent variable over all the levels. That
# is, it uses contrasts between each of the first k-1 levels and level k. In
# this example, level 1 is compared to all the others, level 2 to all the
# others, and level 3 to all the others.

from patsy.contrasts import Sum

from statsmodels.formula.api import ols
# Build the sum (deviation) coding matrix for `levels`, intercept column
# included, and leave it as the cell's last expression so it is echoed.
contrast = Sum().code_with_intercept(levels=levels)
contrast

ContrastMatrix(array([[ 1.,  1.,  0.,  0.],
                      [ 1.,  0.,  1.,  0.],
                      [ 1.,  0.,  0.,  1.],
                      [ 1., -1., -1., -1.]]),
               ['[mean]', '[S.1]', '[S.2]', '[S.3]'])

In [11]:

# Regress `write` on `race` by ordinary least squares, with `race` treated as
# categorical and sum-coded, then print the fitted model's summary table.
mod = ols(formula="write ~ C(race, Sum)", data=hsb2)
res = mod.fit()
print(res.summary())

# Sum coding constrains the level coefficients to add up to zero, so the
# intercept is the grand mean -- here the unweighted mean of the per-level
# means of the dependent variable. Compute that value for comparison with
# the Intercept row of the summary.
hsb2.groupby("race")["write"].agg("mean").mean()

                            OLS Regression Results                            
Dep. Variable:                  write   R-squared:                       0.107
Model:                            OLS   Adj. R-squared:                  0.093
Method:                 Least Squares   F-statistic:                     7.833
Date:                Wed, 13 Apr 2022   Prob (F-statistic):           5.78e-05
Time:                        16:01:03   Log-Likelihood:                -721.77
No. Observations:                 200   AIC:                             1452.
Df Residuals:                     196   BIC:                             1465.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept            51.6784      0.98

51.67837643678162

In [9]:
# Echo the sum-coding contrast matrix (with its intercept column) for `levels`.
# NOTE(review): this looks like a repeat of the `contrast` computation earlier
# in the notebook -- confirm whether this cell is still needed.
Sum().code_with_intercept(levels)

ContrastMatrix(array([[ 1.,  1.,  0.,  0.],
                      [ 1.,  0.,  1.,  0.],
                      [ 1.,  0.,  0.,  1.],
                      [ 1., -1., -1., -1.]]),
               ['[mean]', '[S.1]', '[S.2]', '[S.3]'])