In [24]:
import os
import pandas as pd
import numpy as np
from quasinet.qnet import Qnet, save_qnet, load_qnet, qdistance
from quasinet.qsampling import qsample
import argparse
from tqdm import tqdm

In [25]:
year = 2016
samplesize = 100
OUTDIR = './'
VAR='race'


In [26]:
POLEFILE = '../data/polar_vectors.csv'
MUTFILE = '../data/immutable.csv'
FEATURESBYYEAR = '../data/features_by_year_GSS.csv'
features_by_year = pd.read_csv(FEATURESBYYEAR,
                               keep_default_na=True,
                               index_col=0).set_index('year').loc[year].values[0]
cols=eval(features_by_year)

data = pd.read_csv(f'../data/gss_{year}.csv', keep_default_na=False, dtype=str)[cols]

In [27]:
training_data = data.sample(samplesize)

In [28]:
if VAR:
    vdict=data[VAR].value_counts().to_dict()
    data_s={k:training_data[training_data[VAR]==k] for k in vdict.keys()}
    training_data={k:d.loc[:, d.ne('').any()] for k,d in data_s.items()}
    training_index = {k:training_data[k].index.values for k in vdict.keys()}
    qmodel_path = {k:f'{OUTDIR}/gss_{year}{k}.pkl.gz' for k in vdict.keys()} 
    for k in vdict.keys():
        if not os.path.exists(qmodel_path[k]):
            X_training = training_data[k].values.astype(str)
            Q = Qnet(feature_names=training_data[k].columns, alpha=.1)
            Q.fit(X_training)
            Q.training_index = training_index[k]
            save_qnet(Q, qmodel_path[k].replace('.gz',''), gz=True)
        else:
            Q={k:load_qnet(qmodel_path[k]) for k in vdict.keys()} 
else:
    qmodel_path = f'{OUTDIR}/gss_{year}.pkl.gz'
    if not os.path.exists(qmodel_path):
        X_training = training_data.values.astype(str)
        Q = Qnet(feature_names=training_data.columns, alpha=.1)
        Q.fit(X_training)
        Q.training_index = training_index
        save_qnet(Q, qmodel_path.replace('.gz',''), gz=True)
    else:
        Q=load_qnet(qmodel_path)


In [34]:

def triangle_area(a, b, c):
    """Calculate the area of a triangle given its side lengths using Heron's formula."""
    s = (a + b + c) / 2
    a=(s * (s - a) * (s - b) * (s - c))
    if a > 0:
         return math.sqrt(s * (s - a) * (s - b) * (s - c))
    else:
         return 0.

def calculate_changes(triangle1, triangle2):
    """Calculate changes in area and side lengths between two triangles."""
    area1 = triangle_area(*triangle1)
    area2 = triangle_area(*triangle2)

    area_change = area2 - area1
    #side_length_changes = [triangle2[i] - triangle1[i] for i in range(3)]

    return area_change


def getTau(df):

    Z=df.head(1).values[0]
    def getChange(row,R0=Z[0],L0=Z[1],RL0=Z[2]):
        return calculate_changes((R0,L0,RL0),(row.R,row.L,row.RL))
    
    df['dA']=df.apply(getChange,axis=1)
    N=4
    df_= df[N:]

    response_data = df_['dA'].values
    time = np.arange(len(response_data))

    def decay_function(t, A, tau, C):
        return A * np.exp(-t / tau) + C

    params, covariance = curve_fit(decay_function, time, response_data)
    return params[1],np.sqrt(covariance[1][1])


In [39]:
import math
from scipy.optimize import curve_fit

sp=pd.read_csv(POLEFILE, index_col=0).T
T=1000
if VAR:
    
    for k in vdict.keys():
        NULL={k:np.array(['']*len(Q[k].feature_names)).astype('U100') for k in vdict.keys()} 
        sp_={k:pd.concat([pd.DataFrame(columns=Q[k].feature_names),
                       sp])[Q[k].feature_names].fillna('').values.astype(str) 
             for k in vdict.keys()} 
        
        D=pd.DataFrame({m:
                        (qdistance(qsample(sp_[k][0],Q[k],steps=m),NULL[k],Q[k],Q[k]),
                         qdistance(qsample(sp_[k][1],Q[k],steps=m),NULL[k],Q[k],Q[k]),
                         qdistance(qsample(sp_[k][0],Q[k],steps=m),
                                   qsample(sp_[k][1],Q[k],steps=m),Q[k],Q[k]))
                        for m in tqdm(np.arange(1,T,100))})
        D.to_csv(f'{OUTDIR}/relaxation_{year}{k}.csv')
        tau,cov=getTau(D.T.rename(columns={0:'R',1:'L',2:'RL'}))
        print(year,k,tau,cov)
else:
    sp_=pd.concat([pd.DataFrame(columns=feature_names),
               sp])[feature_names].fillna('').values.astype(str)

    NULL=np.array(['']*len(Q.feature_names)).astype('U100')
    D=pd.DataFrame({m:
                    (qdistance(qsample(sp_[0],Q,steps=m),NULL,Q,Q),
                     qdistance(qsample(sp_[1],Q,steps=m),NULL,Q,Q),
                     qdistance(qsample(sp_[0],Q,steps=m),
                               qsample(sp_[1],Q,steps=m),Q,Q))
                    for m in tqdm(np.arange(1,T,100))})
    D.to_csv(f'{OUTDIR}/relaxation_{year}.csv')


100%|███████████████████████████████████████████| 10/10 [00:25<00:00,  2.52s/it]


2016 white 60.792134872003864 1075.1695417050048


  0%|                                                    | 0/10 [19:30<?, ?it/s]
100%|███████████████████████████████████████████| 10/10 [00:12<00:00,  1.21s/it]


2016 black 1.1393246287182266 0.14614660826763204


100%|███████████████████████████████████████████| 10/10 [00:09<00:00,  1.03it/s]

2016 other 22342.87138362543 240425014.86756015





In [None]:
Q