In this notebook I will create a synthesized version of my original dataset using the Gaussian Copula model.

In mathematical terms, a *copula* is a distribution over the unit cube
${\displaystyle [0,1]^{d}}$ which is constructed from a multivariate
normal distribution over ${\displaystyle \mathbb {R} ^{d}}$ by using the
probability integral transform. Intuitively, a *copula* is a
mathematical function that allows us to describe the joint distribution
of multiple random variables by analyzing the dependencies between their
marginal distributions.

In [101]:
import pandas as pd
import csv
from sdv.tabular import GaussianCopula
import warnings
warnings.filterwarnings('ignore', 'The iteration is not making good progress')

In [102]:
df = pd.read_csv("compas-python-master/compas-scores-two-years-copy.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7214 entries, 0 to 7213
Data columns (total 53 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       199 non-null    float64
 1   name                     199 non-null    object 
 2   first                    199 non-null    object 
 3   last                     199 non-null    object 
 4   compas_screening_date    199 non-null    object 
 5   sex                      199 non-null    object 
 6   dob                      199 non-null    object 
 7   age                      199 non-null    float64
 8   age_cat                  199 non-null    object 
 9   race                     199 non-null    object 
 10  juv_fel_count            199 non-null    float64
 11  decile_score             199 non-null    float64
 12  juv_misd_count           199 non-null    float64
 13  juv_other_count          199 non-null    float64
 14  priors_count            

In [103]:
df_clean = df.drop(df.index[300:])
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 300 entries, 0 to 299
Data columns (total 53 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       199 non-null    float64
 1   name                     199 non-null    object 
 2   first                    199 non-null    object 
 3   last                     199 non-null    object 
 4   compas_screening_date    199 non-null    object 
 5   sex                      199 non-null    object 
 6   dob                      199 non-null    object 
 7   age                      199 non-null    float64
 8   age_cat                  199 non-null    object 
 9   race                     199 non-null    object 
 10  juv_fel_count            199 non-null    float64
 11  decile_score             199 non-null    float64
 12  juv_misd_count           199 non-null    float64
 13  juv_other_count          199 non-null    float64
 14  priors_count             1

In [104]:
df_clean.drop(['vr_case_number', 'vr_charge_degree', 'vr_offense_date',
              'vr_charge_desc', 'r_case_number', 'r_charge_degree',
              'r_days_from_arrest', 'r_offense_date', 'r_charge_desc', 
              'r_jail_in', 'r_jail_out', 'violent_recid', 'c_arrest_date'], axis = 1, inplace = True)
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 300 entries, 0 to 299
Data columns (total 40 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       199 non-null    float64
 1   name                     199 non-null    object 
 2   first                    199 non-null    object 
 3   last                     199 non-null    object 
 4   compas_screening_date    199 non-null    object 
 5   sex                      199 non-null    object 
 6   dob                      199 non-null    object 
 7   age                      199 non-null    float64
 8   age_cat                  199 non-null    object 
 9   race                     199 non-null    object 
 10  juv_fel_count            199 non-null    float64
 11  decile_score             199 non-null    float64
 12  juv_misd_count           199 non-null    float64
 13  juv_other_count          199 non-null    float64
 14  priors_count             1

In [105]:
model = GaussianCopula()
model.fit(df_clean)

  return c**2 / (c**2 - n**2)
  Lhat = muhat - Shat*mu
  sk = 2*(b-a)*np.sqrt(a + b + 1) / (a + b + 2) / np.sqrt(a*b)
  return _boost._beta_cdf(x, a, b)
  return cd2*x**(c-1)
  a = (self.min - loc) / scale
  b = (self.max - loc) / scale


In [109]:
df_new = model.sample(7000)
df_new.head()

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
0,140.0,matthew grant,michael,hernandez,9/5/13,Male,7/24/85,49.0,25 - 45,African-American,...,1.0,Low,9/5/13,,,0.0,0.0,993.0,0.0,0.0
1,96.0,maslin brutus,michael,wilson,11/15/14,Male,3/16/84,44.0,,African-American,...,3.0,Low,11/15/14,,,5.0,32.0,708.0,1.0,1.0
2,181.0,kortney coleman,matthew,jones,1/1/13,,8/22/78,32.0,25 - 45,African-American,...,3.0,Low,1/1/13,,,0.0,0.0,1005.0,0.0,0.0
3,48.0,jimmy bell,jimmy,bell,1/27/14,Male,4/2/57,31.0,25 - 45,Other,...,2.0,Low,1/27/14,,,2.0,0.0,235.0,0.0,0.0
4,228.0,neil heckart,neil,heckart,2/26/13,,12/24/84,37.0,Less than 25,Hispanic,...,3.0,Medium,2/26/13,,,5.0,49.0,718.0,1.0,1.0


In [110]:
model.save('synth_model.pkl')

In [111]:
loaded = GaussianCopula.load('synth_model.pkl')
df_new = loaded.sample(7000)

  return _boost._beta_ppf(q, a, b)


In [112]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7000 entries, 0 to 6999
Data columns (total 40 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       7000 non-null   float64
 1   name                     7000 non-null   object 
 2   first                    7000 non-null   object 
 3   last                     7000 non-null   object 
 4   compas_screening_date    7000 non-null   object 
 5   sex                      5972 non-null   object 
 6   dob                      7000 non-null   object 
 7   age                      7000 non-null   float64
 8   age_cat                  6681 non-null   object 
 9   race                     6938 non-null   object 
 10  juv_fel_count            7000 non-null   float64
 11  decile_score             7000 non-null   float64
 12  juv_misd_count           7000 non-null   float64
 13  juv_other_count          7000 non-null   float64
 14  priors_count            