In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
import random
import math

In [None]:
model_attribute_names = ['online class', 'Timestamp', 'reputation', 'Quality']
COVID_start_timestamp = 1584247043
def get_df_from_csv(csv_name):
    return pd.read_csv(csv_name, quotechar='"', skipinitialspace=True)
def join_df_on_sid(student_rating_csv_name, university_rating_csv_name):
    student_rating_df = get_df_from_csv(student_rating_csv_name)
    university_rating_df = get_df_from_csv(university_rating_csv_name)
    return student_rating_df.merge(university_rating_df, on='sid')
def reduce_df_attributes(joined_df):
    return joined_df[model_attribute_names]
def convert_is_online(df):
    df['online class'] = np.where(df['online class'] == 'False', 0, 1)
    return df
def convert_timestamp_to_during_COVID(df):
    df['Timestamp'] = np.where(df['Timestamp'] < COVID_start_timestamp, 0, 1)
    df = df.rename(columns={'Timestamp': 'during COVID'})
    return df

In [None]:
df = join_df_on_sid('cleaned_ratings.csv', 'school_ratings.csv')
df = reduce_df_attributes(df)
df = convert_timestamp_to_during_COVID(df)
df = convert_is_online(df)

X, y, z = [], [], []
for idx, row in df.iterrows():
  X.append([row['during COVID'], row['reputation']])
  y.append(row['online class'])
  z.append(row['Quality'])

grouped = list(zip(X, y, z))
random.shuffle(grouped)
X = np.array([group[0] for group in grouped])
y = np.array([group[1] for group in grouped])
z = np.array([group[2] for group in grouped])

In [None]:
train_X = X[:math.floor(len(y) * .9)]
train_y = y[:math.floor(len(y) * .9)]
test_X = X[math.floor(len(y) * .9):]
test_y = y[math.floor(len(y) * .9):]

In [None]:
clf = LinearRegression().fit(X, y)

In [None]:
clf = LogisticRegression(random_state=0).fit(train_X, train_y)

$$p(O | \text{C}, \text{R}) = \theta_0 + \theta_1O + \theta_2R$$

In [None]:
propensities = clf.predict_proba(X)[:, 0]
propensities

array([0.97949811, 0.9629896 , 0.96643479, ..., 0.94551086, 0.9629896 ,
       0.9629896 ])

In [None]:
df['propensity'] = propensities

df

Unnamed: 0,online class,during COVID,reputation,Quality,propensity
0,0,0,4.0,5.0,0.979498
1,0,0,4.0,4.0,0.962990
2,0,0,4.0,3.5,0.966435
3,0,0,4.0,3.5,0.962990
4,0,0,4.0,4.0,0.966435
...,...,...,...,...,...
510359,0,0,3.6,5.0,0.984788
510360,0,0,3.6,5.0,0.988729
510361,0,0,3.6,5.0,0.945511
510362,0,0,3.6,1.0,0.962990


In [None]:
groups = {}
mean = z.mean()
for i, propensity in enumerate(propensities):
  if propensity not in groups:
    # [#in-person, #online, #in-person & quality, #online & quality, #total]
    groups[propensity] = [0, 0, 0, 0, 0]
  yi = math.floor(y[i])
  groups[propensity][yi] += 1
  if z[i] > mean:
    groups[propensity][yi + 2] += 1
  groups[propensity][4] += 1 # sum

ps = list(groups.keys())
for propensity in ps:
  group = groups[propensity]
  if group[0] == 0 or group[1] == 0:
    del groups[propensity]

In [None]:
ip_numerator = 0
ip_denominator = 0
o_numerator = 0
o_denominator = 0
for propensity in groups:
  num_ip, num_o, num_ipq, num_oq, num_total = groups[propensity]
  ip_numerator += num_total / num_ip * num_ipq / num_ip
  o_numerator += num_total / num_o * num_oq / num_o
  ip_denominator += num_total / num_ip
  o_denominator += num_total / num_o
print(ip_numerator / ip_denominator, o_numerator / o_denominator)

0.6239724019784485 0.511973478070377


In [None]:
dfNotOnline = df[df['online class'] == 0]

dfNotOnline

Unnamed: 0,online class,during COVID,reputation,Quality,propensity
0,0,0,4.0,5.0,0.979498
1,0,0,4.0,4.0,0.962990
2,0,0,4.0,3.5,0.966435
3,0,0,4.0,3.5,0.962990
4,0,0,4.0,4.0,0.966435
...,...,...,...,...,...
510359,0,0,3.6,5.0,0.984788
510360,0,0,3.6,5.0,0.988729
510361,0,0,3.6,5.0,0.945511
510362,0,0,3.6,1.0,0.962990


In [None]:
dfOnline = df[df['online class'] == 1]

dfOnline

Unnamed: 0,online class,during COVID,reputation,Quality,propensity
588,1,0,4.0,5.0,0.966435
611,1,0,4.0,3.5,0.962990
788,1,0,4.0,1.0,0.966435
928,1,0,4.0,4.5,0.979498
1012,1,0,4.0,1.0,0.955053
...,...,...,...,...,...
510254,1,0,3.6,3.0,0.986233
510255,1,0,3.6,3.5,0.655379
510256,1,0,3.6,1.5,0.979498
510258,1,0,3.6,1.0,0.986233


In [None]:
causalEffectWithTreatment = 0

for index, row in dfOnline.iterrows():
  causalEffectWithTreatment += row['Quality'] / row['propensity']

causalEffectWithTreatment /= (len(dfOnline)) 

causalEffectWithTreatment


3.575518365187849

In [None]:
causalEffectWithoutTreatment = 0

for index, row in dfNotOnline.iterrows():
  causalEffectWithoutTreatment += row['Quality'] / (row['propensity'])

causalEffectWithoutTreatment /= (len(dfNotOnline)) 

causalEffectWithoutTreatment

3.9160331216785718

In [None]:
causalEffectWithTreatment - causalEffectWithoutTreatment

-0.3405147564907227

In [None]:
causalEffectWithTreatment/causalEffectWithoutTreatment

0.9130459968263077

$$E[P^{O=1}] - E[P^{O=0}] = -0.3405147564907227$$
$$\frac{E[P^{O=1}]}{E[P^{O=0}]} = 0.9130459968263077$$

$\frac{1}{n_1} \sum_{i:O_i = 1} \frac{p_i}{P(O | C, R)}$
$$\frac{1}{n_2} \sum_{i:O_i = 0} \frac{p_i}{P(O | C, R)}$$$