# Dataset to Export
- for trend analysis, not matched with agent information

In [1]:
import os
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()
import hashlib

# import custom functions from export_utils.py
from export_utils import *

## Load Configs
- classification maps for explicit measure

In [2]:
# general functions
def load_category_map():
    """Load the classification label maps from ``classification_map.json``.

    The file is opened relative to the current working directory and must
    contain the top-level keys ``wsd``, ``pb_primary`` and ``pb_secondary``.

    Returns:
        tuple: ``(wsd_map, pb_primary_map, pb_secondary_map)``, in that order.

    Raises:
        FileNotFoundError: if ``classification_map.json`` does not exist.
        KeyError: if one of the three expected keys is missing.
    """
    # JSON text is UTF-8; pin the encoding so decoding does not depend on the
    # platform's locale default.
    with open("classification_map.json", "r", encoding="utf-8") as f:
        cat_map = json.load(f)
    wsd_map = cat_map['wsd']
    pb_primary_map = cat_map['pb_primary']
    pb_secondary_map = cat_map['pb_secondary']
    return wsd_map, pb_primary_map, pb_secondary_map

def reverse_dict(dictionary):
    """Return a new dict that maps each value of ``dictionary`` to its key.

    If several keys share the same value, the key that comes last in
    iteration order is the one kept. The input is not modified.
    """
    inverted = {}
    for key, value in dictionary.items():
        inverted[value] = key
    return inverted

# Load the three label maps once; they live as notebook-level globals.
wsd_map, pb_primary_map, pb_secondary_map = load_category_map()
# Inverse of pb_secondary_map (value -> key). It is bound as the default
# argument of the create_pb_labels* functions, so it must exist before they
# are defined.
reverse_secondary_map = reverse_dict(pb_secondary_map)

In [3]:
def create_pb_labels(df, reverse_secondary_map=reverse_secondary_map):
    """Add the pb label columns and the ``IsPerson`` dummy to ``df`` in place.

    Columns read: ``pb_primary_predictions``, ``pb_primary_confidences``,
    ``pb_subcategory_predictions``, ``pb_subcategory_confidences``.
    Columns written: ``pb_primary_label``, ``pb_secondary_label``,
    ``IsPerson``, ``secondaryLabel``.

    Args:
        df: DataFrame holding the prediction/confidence columns listed above.
        reverse_secondary_map: mapping applied to ``pb_secondary_label`` to
            produce ``secondaryLabel``; labels absent from it become NaN.

    Returns:
        The same ``df`` object, mutated.
    """
    # gen_true_label comes from export_utils; it collapses a row's
    # predictions/confidences into a single label (semantics defined there).
    df['pb_primary_label'] = df.apply(
        lambda x: gen_true_label(x['pb_primary_predictions'], x['pb_primary_confidences']),
        axis=1,
    )
    df['pb_secondary_label'] = df.apply(
        lambda x: gen_true_label(x['pb_subcategory_predictions'], x['pb_subcategory_confidences']),
        axis=1,
    )
    # Dummy variable for the downstream logit model: 1 when the primary label
    # is 0 and the secondary label is not one of the excluded codes
    # (0: others, 4: household, 999: low confidence or not classified).
    # Computed as a vectorized mask rather than a row-wise apply: same values,
    # far faster, and it also works on an empty frame.
    is_primary_zero = df['pb_primary_label'] == 0
    is_excluded_secondary = df['pb_secondary_label'].isin([0, 4, 999])
    df['IsPerson'] = (is_primary_zero & ~is_excluded_secondary).astype('int64')
    df['secondaryLabel'] = df['pb_secondary_label'].map(reverse_secondary_map)
    return df

## Congress

In [15]:
## explicit
# Processing steps handed to process_explicit_data (presumably applied in
# list order — see export_utils). create_pb_labels is defined in this
# notebook; the other steps are expected to come from export_utils.
congress_steps = [speechid_to_congress_df_operation, congress_variable_rename, remove_intransitive, create_pb_labels, explicit_sent_hash]
# Reads the classified congress sentences and writes congress_explicit_trend.csv,
# the name the Merge section later looks for.
process_explicit_data(
    data_path="/zfs/projects/faculty/amirgo-management/code_review_data/congress/",
    filename="congress_mgmt_sent_wsd_pb_classified.csv",
    processing_steps=congress_steps,
    export_filename="congress_explicit_trend.csv"
)

100%|██████████| 307863/307863 [00:03<00:00, 100894.80it/s]


In [16]:
## implicit
# Rebinds congress_steps: the implicit pipeline has no intransitive filter or
# pb-label step and uses implicit_sent_hash instead of explicit_sent_hash.
congress_steps = [speechid_to_congress_df_operation, congress_variable_rename, implicit_sent_hash]
# Input is a pickle file; output is congress_implicit_trend.csv, the name the
# Merge section later looks for.
process_implicit_data(
    data_path="/zfs/projects/faculty/amirgo-management/code_review_data/congress/",
    filename="congress_implicit_measure.pkl",
    processing_steps=congress_steps,
    export_filename="congress_implicit_trend.csv"
)

100%|██████████| 4052432/4052432 [00:33<00:00, 120702.69it/s]


## Movie

In [17]:
## explicit
# Same step sequence as congress explicit, minus the speech-id lookup step.
movie_steps = [movie_variable_rename, remove_intransitive, create_pb_labels, explicit_sent_hash]
# NOTE(review): this writes "movie_explicit_trend_v2.csv", but the Merge
# section reads f"{key}_explicit_trend.csv" (i.e. "movie_explicit_trend.csv").
# Confirm which file the pooled export is supposed to use.
process_explicit_data(
    data_path="/zfs/projects/faculty/amirgo-management/code_review_data/movie/",
    filename="movie_mgmt_sent_wsd_pb_classified.csv",
    processing_steps=movie_steps,
    export_filename="movie_explicit_trend_v2.csv"
)

100%|██████████| 28253/28253 [00:00<00:00, 105998.70it/s]


In [32]:
## implicit
# Rebinds movie_steps for the implicit pipeline (rename + sentence hash only).
movie_steps = [movie_variable_rename, implicit_sent_hash]
# Input is a pickle file; output is movie_implicit_trend.csv, the name the
# Merge section later looks for.
process_implicit_data(
    data_path="/zfs/projects/faculty/amirgo-management/code_review_data/movie/",
    filename="movie_implicit_measure.pkl",
    processing_steps=movie_steps,
    export_filename="movie_implicit_trend.csv"
)

100%|██████████| 2295968/2295968 [00:18<00:00, 122462.84it/s]


## Caselaw

In [None]:
# Explicit pipeline for caselaw: caseid_to_year runs first (presumably to
# attach a year from the case id — see export_utils), then the same
# rename / filter / label / hash sequence used for the other datasets.
caselaw_steps = [caseid_to_year, case_variable_rename, remove_intransitive, create_pb_labels, explicit_sent_hash]
# NOTE(review): this writes "caselaw_explicit_trend_v2.csv", but the Merge
# section reads f"{key}_explicit_trend.csv" (i.e. "caselaw_explicit_trend.csv").
# Confirm which file the pooled export is supposed to use.
process_explicit_data(
    data_path="/zfs/projects/faculty/amirgo-management/code_review_data/caselaw/total/",
    filename="caselaw_mgmt_sent_wsd_pb_classified.csv",
    processing_steps=caselaw_steps,
    export_filename="caselaw_explicit_trend_v2.csv"
)

In [16]:
# The caselaw implicit measure is stored as six pickle parts (p1..p6); each
# part is processed and exported on its own, then merged in the next cell.
files = [f"caselaw_implicit_measure_p{i}.pkl" for i in range(1, 7)]
caselaw_steps = [caseid_to_year, case_variable_rename, implicit_sent_hash]
for i, file in enumerate(files):
    process_implicit_data(
        data_path="/zfs/projects/faculty/amirgo-management/code_review_data/caselaw/total/",
        filename=file,
        processing_steps=caselaw_steps,
        # enumerate() is 0-based while the part files are 1-based, hence i+1:
        # input part pN is written to caselaw_implicit_trend_pN.csv.
        export_filename=f"caselaw_implicit_trend_p{i+1}.csv"
    )

100%|██████████| 4418364/4418364 [00:38<00:00, 114836.17it/s]
100%|██████████| 4425640/4425640 [00:36<00:00, 121531.39it/s]
100%|██████████| 4439993/4439993 [00:35<00:00, 124179.81it/s]
100%|██████████| 4447293/4447293 [00:36<00:00, 122060.81it/s]
100%|██████████| 4381942/4381942 [00:37<00:00, 118178.78it/s]
100%|██████████| 4406786/4406786 [00:35<00:00, 123995.39it/s]


In [None]:
# merge all parts of implicit files
# Reads the six per-part CSVs (p1..p6) written by the previous cell, stacks
# them with a fresh 0..n-1 index, and writes the single
# caselaw_implicit_trend.csv that the Merge section reads.
caselaw_implicit = pd.concat([pd.read_csv(f"/zfs/projects/faculty/amirgo-management/code_review_data/caselaw/total/caselaw_implicit_trend_p{i}.csv") for i in range(1,7)], ignore_index=True)
caselaw_implicit.to_csv("/zfs/projects/faculty/amirgo-management/code_review_data/caselaw/total/caselaw_implicit_trend.csv", index=False)

## NYT

In [17]:
def create_pb_labels_nyt(df, reverse_secondary_map=reverse_secondary_map):
    """Add ``IsPerson`` and ``secondaryLabel`` to an NYT frame in place.

    Unlike ``create_pb_labels``, this does not derive the label columns: ``df``
    must already contain ``pb_primary_label`` and ``pb_secondary_label``.

    Args:
        df: DataFrame with ``pb_primary_label`` and ``pb_secondary_label``.
        reverse_secondary_map: mapping applied to ``pb_secondary_label`` to
            produce ``secondaryLabel``; labels absent from it become NaN.

    Returns:
        The same ``df`` object, mutated.
    """
    # 1 when the primary label is 0 and the secondary label is not one of the
    # excluded codes (0: others, 4: household, 999: low confidence or not
    # classified). Computed as a vectorized mask rather than a row-wise apply:
    # same values, far faster on millions of rows, and safe on an empty frame.
    is_primary_zero = df['pb_primary_label'] == 0
    is_excluded_secondary = df['pb_secondary_label'].isin([0, 4, 999])
    df['IsPerson'] = (is_primary_zero & ~is_excluded_secondary).astype('int64')
    df['secondaryLabel'] = df['pb_secondary_label'].map(reverse_secondary_map)
    return df

In [24]:
# NYT is exported step by step instead of through process_explicit_data: the
# input is a parquet file and the label columns are already present, so
# create_pb_labels_nyt is used in place of create_pb_labels.
# `path` is also used by the NYT implicit cell below.
path = "/zfs/projects/faculty/amirgo-management/code_review_data/nyt/"
explicit_df = pd.read_parquet(path + "explicit_measure_full.parquet.gzip", engine='pyarrow')
explicit_df = nyt_variable_rename(explicit_df)
explicit_df = create_pb_labels_nyt(explicit_df)
# idx_sent_hash comes from export_utils (presumably builds the sentence id
# from the row index — confirm there).
explicit_df = idx_sent_hash(explicit_df)
export_explicit_df(explicit_df, path + "nyt_explicit_trend.csv")

100%|██████████| 4142211/4142211 [00:29<00:00, 140507.84it/s]


In [28]:
# Relies on `path` still pointing at the NYT directory (set in the NYT
# explicit cell); re-run that cell first if `path` has been reassigned since.
implicit_df = pd.read_parquet(path + "1831_implicit_measure_full.parquet.gzip", engine='pyarrow')
# Replace the index stored in the parquet file with a fresh 0..n-1 RangeIndex
# before the remaining steps run.
implicit_df.reset_index(drop=True,inplace=True)
implicit_df = nyt_variable_rename(implicit_df)
implicit_df = idx_sent_hash(implicit_df)
export_implicit_df(implicit_df, path + "nyt_implicit_trend.csv")

100%|██████████| 29023124/29023124 [03:28<00:00, 138906.31it/s]


## Fiction

In [None]:
# NOTE: this rebinds the global `path` (previously the NYT directory). The
# fiction cells below spell their paths out in full and do not read it.
path = "/zfs/projects/faculty/amirgo-management/code_review_data/fiction/"
# Metadata table for the fiction corpus; only `docid` and `latestcomp` are used.
meta_df = pd.read_csv("/zfs/projects/faculty/amirgo-management/HathiTrust/post45fiction.csv")
# docid -> latestcomp lookup, consumed by filename_to_year as the year source.
filename_to_year_dict = meta_df.set_index('docid')['latestcomp'].to_dict()

def filename_to_year(df):
    """Add a ``year`` column to ``df`` (in place) looked up from ``filename``.

    The last four characters of each ``filename`` are dropped (presumably a
    four-character extension such as ".txt" — confirm against the data) and
    the remainder is looked up in the module-level ``filename_to_year_dict``.
    Rows whose key is not in the dict get NaN.

    Args:
        df: DataFrame with a string ``filename`` column.

    Returns:
        The same ``df`` object, mutated.
    """
    # Vectorized slice + dict map instead of a per-row lambda: same result on
    # string filenames, missing filenames yield NaN instead of raising, and it
    # is much faster on the multi-million-row implicit export.
    doc_ids = df['filename'].str[:-4]
    df['year'] = doc_ids.map(filename_to_year_dict)
    return df

In [8]:
# filename_to_year runs first and reads the `filename` column, i.e. before
# fiction_variable_rename has a chance to rename anything.
# NOTE(review): unlike the congress/movie/caselaw explicit pipelines, this
# list has no explicit_sent_hash step — presumably fiction_explicit_export.csv
# already carries sentence ids; confirm.
fiction_steps = [filename_to_year, fiction_variable_rename, remove_intransitive, create_pb_labels]
process_explicit_data(
    data_path="/zfs/projects/faculty/amirgo-management/code_review_data/fiction/",
    filename="fiction_explicit_export.csv",
    processing_steps=fiction_steps,
    export_filename="fiction_explicit_trend.csv"
)

Dataset before processing: 965333 rows, 12 columns.
Dataset after processing: 367244 rows, 18 columns.


In [19]:
# Fiction implicit export, run step by step on the "nohash" CSV: attach the
# year, rename the columns, add ids via idx_sent_hash, then write the CSV that
# the Merge section reads.
implicit_df = pd.read_csv("/zfs/projects/faculty/amirgo-management/code_review_data/fiction/fiction_implicit_export_nohash.csv")
# Must run before the rename: filename_to_year reads the `filename` column.
implicit_df = filename_to_year(implicit_df)
implicit_df = fiction_variable_rename(implicit_df)
implicit_df = idx_sent_hash(implicit_df)
export_implicit_df(implicit_df, "/zfs/projects/faculty/amirgo-management/code_review_data/fiction/fiction_implicit_trend.csv")

100%|██████████| 40101869/40101869 [04:47<00:00, 139625.62it/s]


## Merge

In [2]:
# Dataset key -> directory holding that dataset's "<key>_explicit_trend.csv"
# and "<key>_implicit_trend.csv". The merge loops iterate this dict, so its
# insertion order is the row order of the pooled files.
data_paths = {"congress": "/zfs/projects/faculty/amirgo-management/code_review_data/congress/",
                "movie": "/zfs/projects/faculty/amirgo-management/code_review_data/movie/",
                "caselaw": "/zfs/projects/faculty/amirgo-management/code_review_data/caselaw/total/",
                "nyt": "/zfs/projects/faculty/amirgo-management/code_review_data/nyt/",
                "fiction": "/zfs/projects/faculty/amirgo-management/code_review_data/fiction/"}

# Output directory for the pooled (all-dataset) CSVs; must end with "/"
# because file names are appended by plain string concatenation.
pooling_path = "/zfs/projects/faculty/amirgo-management/code_review_data/pooling/"

In [5]:
# Pool the per-dataset explicit exports into a single frame, in data_paths order.
# NOTE(review): every dataset is read from "<key>_explicit_trend.csv", but the
# movie and caselaw export cells write "<key>_explicit_trend_v2.csv" — confirm
# which files are the intended inputs here.
explicit_dfs = []
for key, path in data_paths.items():
    explicit_df = pd.read_csv(path + f"{key}_explicit_trend.csv")
    explicit_dfs.append(explicit_df)
explicit_df = pd.concat(explicit_dfs, ignore_index=True)
# Floor each year to the start of its decade (e.g. 1987 -> 1980). Vectorized
# floor division gives the same values as a per-row lambda (NaN stays NaN)
# without a Python-level loop over millions of rows.
explicit_df['decade'] = explicit_df['year'] // 10 * 10
print(explicit_df.columns)
explicit_df.to_csv(pooling_path+"explicit_trend.csv",index=False)

Index(['year', 'dataset', 'doc_id', 'sentence_id', 'IsPerson',
       'secondaryLabel', 'decade'],
      dtype='object')


In [7]:
# Sanity check: number of pooled explicit rows contributed by each dataset.
explicit_df['dataset'].value_counts()

dataset
nyt         4142211
caselaw     1552525
fiction      367244
congress     307863
movie         28253
Name: count, dtype: int64

In [3]:
# Pool the per-dataset implicit exports into a single frame, in data_paths order.
implicit_dfs = []
for key, path in data_paths.items():
    implicit_df = pd.read_csv(path + f"{key}_implicit_trend.csv")
    implicit_dfs.append(implicit_df)
implicit_df = pd.concat(implicit_dfs, ignore_index=True)
# Floor each year to the start of its decade (e.g. 1987 -> 1980). Vectorized
# floor division gives the same values as a per-row lambda (NaN stays NaN)
# without a Python-level loop over roughly 100M pooled rows.
implicit_df['decade'] = implicit_df['year'] // 10 * 10
print(implicit_df.columns)
implicit_df.to_csv(pooling_path+"implicit_trend.csv",index=False)

Index(['year', 'dataset', 'doc_id', 'sentence_id', 'object',
       'subgroup_orig_syn_ratio', 'top_subgroup', 'decade'],
      dtype='object')


In [4]:
# Sanity check: number of pooled implicit rows contributed by each dataset.
implicit_df['dataset'].value_counts()

dataset
fiction     40101869
nyt         29023124
caselaw     26520018
congress     4052432
movie        2295968
Name: count, dtype: int64