# Dataset to Export
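- For the agent analysis: match each document in a dataset to its agent (Congress speaker, movie writer, or fiction author) and export the matched data.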

In [1]:
import os
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()
import hashlib

# import custom functions from export_utils.py
from export_utils import *

In [2]:
# Load the object-category map (JSON); it is passed to gen_group_category in the implicit cells below.
category_map_file = '/zfs/projects/faculty/amirgo-management/code_review/3_feature_generation/implicit/fillmask_target_objects.json'
with open(category_map_file) as map_handle:
    category_map = json.load(map_handle)
# Display the category names stored under the 'lemmatized' entry.
category_map['lemmatized'].keys()

dict_keys(['subjectivity', 'body', 'relationship', 'relational_subjectivity'])

# Congress

In [3]:
# prepare metadata: one row of speaker information per Congress speech_id
congress_path="/zfs/projects/faculty/amirgo-management/congress/"

# Speaker <-> speech mapping. Note that the same speaker can have multiple
# speaker ids (e.g. if they are in multiple congress terms).
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
speaker_map_file = congress_path+"/speeches_processed/total_speaker_map.pkl"
total_speaker_map = pd.read_pickle(speaker_map_file)

# Wiki data mapping for the speakers.
meta_df = pd.read_csv(congress_path + "congress_meta_data_flat.csv")

# Add a 'congress' column computed from speech_id by speechid_to_congress
# (from export_utils); it is one of the merge keys below.
total_speaker_map['congress'] = total_speaker_map['speech_id'].progress_apply(speechid_to_congress)

speaker_merge_keys = ['firstname','lastname','state','gender','congress']
total_speaker_map = (
    total_speaker_map
    .merge(meta_df, on=speaker_merge_keys, how='left')  # merge in wiki data
    .drop_duplicates(subset=['speech_id'])  # keep the first row for each speech
)

100%|██████████| 6469769/6469769 [00:07<00:00, 876167.56it/s] 


In [None]:
# Explicit
## matching: attach speaker metadata to each explicit-trend row via its speech id
data_path="/zfs/projects/faculty/amirgo-management/code_review_data/congress/"
explicit_df = pd.read_csv(data_path + "congress_explicit_trend.csv")
print("Shape before matching: ", explicit_df.shape)
explicit_matched = (
    explicit_df
    .merge(total_speaker_map, left_on='doc_id', right_on='speech_id', how='left')
    .dropna(subset=['speakerid'])  # keep only rows that found a speaker
)
print("Shape after matching: ", explicit_matched.shape) # about 20% of the data is lost

## add new columns: age, decade
# birth_year is the part of the birthday string before the first '-';
# non-string birthdays (e.g. missing values) become NaN.
explicit_matched['birth_year'] = explicit_matched['birthday'].apply(
    lambda birthday: int(birthday.split('-')[0]) if isinstance(birthday, str) else np.nan
)
explicit_matched['age'] = explicit_matched.apply(
    lambda row: generate_age(row['year'], row['birth_year']), axis=1
)
explicit_matched['decade'] = explicit_matched['year'] // 10 * 10

## convert gender to female and male: "F" -> 'female', anything else -> 'male'
is_female = explicit_matched['gender'].eq("F")
explicit_matched['gender'] = is_female.map({True: 'female', False: 'male'}) # no missing values

## save
export_explicit_agent_match(explicit_matched, data_path + "congress_explicit_agent.csv")

Shape before matching:  (307863, 6)
Shape after matching:  (249849, 23)


In [4]:
# Implicit
## matching: attach speaker metadata to each implicit-trend row via its speech id
data_path="/zfs/projects/faculty/amirgo-management/code_review_data/congress/"
implicit_df = pd.read_csv(data_path + "congress_implicit_trend.csv")
print("Shape before matching: ", implicit_df.shape)
implicit_matched = (
    implicit_df
    .merge(total_speaker_map, left_on='doc_id', right_on='speech_id', how='left')
    .dropna(subset=['speakerid'])  # keep only rows that found a speaker
)
print("Shape after matching: ", implicit_matched.shape) # about 20% of the data is lost

## add new columns: age, object category, decade
# birth_year is the part of the birthday string before the first '-';
# non-string birthdays (e.g. missing values) become NaN.
implicit_matched['birth_year'] = implicit_matched['birthday'].apply(
    lambda birthday: int(birthday.split('-')[0]) if isinstance(birthday, str) else np.nan
)
implicit_matched['age'] = implicit_matched.apply(
    lambda row: generate_age(row['year'], row['birth_year']), axis=1
)
implicit_matched['decade'] = implicit_matched['year'] // 10 * 10
# object category columns come from gen_group_category (export_utils)
implicit_matched = gen_group_category(implicit_matched, category_map)

## convert gender to female and male: "F" -> 'female', anything else -> 'male'
is_female = implicit_matched['gender'].eq("F")
implicit_matched['gender'] = is_female.map({True: 'female', False: 'male'}) # no missing values

## save
export_implicit_agent_match(implicit_matched, data_path + "congress_implicit_agent.csv")

Shape before matching:  (4052432, 7)
Shape after matching:  (3280718, 24)


# Movie

In [5]:
# prepare metadata
movie_path = "/zfs/projects/faculty/amirgo-management/opus/processed/"
# individual data
meta_df = pd.read_csv(movie_path + "opus_ses_characteristics_labelled_subset.csv")
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
map_df = pd.read_pickle(movie_path + "imdb_wikidata_person_mapping.pkl")
map_df = map_df.merge(meta_df, left_on="wikidata_id", right_on="qid", how="left") # merge people's imdb id with their wiki data id

# movie-crew mapping (the `writers` column is used for matching in the cells below)
imdb_path = "/zfs/projects/faculty/amirgo-management/imdb/"
# IMDb's TSV dumps write the literal "\N" for missing fields. Parse it as NaN so
# that the later dropna(subset=['writers']) actually removes titles without
# writers, instead of carrying a bogus "\N" writer id into the person merge.
imdb_crew = pd.read_csv(imdb_path+"title.crew.tsv", sep='\t', na_values="\\N")
imdb_crew['tconst'] = imdb_crew['tconst'].astype(str)

In [4]:
# Explicit
## matching: movie -> writer ids (imdb_crew) -> writer metadata (map_df)
data_path="/zfs/projects/faculty/amirgo-management/code_review_data/movie/"
explicit_df = pd.read_csv(data_path + "movie_explicit_trend.csv")
print("Shape before matching: ", explicit_df.shape)
explicit_matched = (
    explicit_df
    .merge(imdb_crew, left_on='doc_id', right_on='tconst', how='left')
    .dropna(subset=['writers'])
)
# comma-separated writer ids -> list, so each writer gets its own row below
explicit_matched['writers'] = explicit_matched['writers'].str.split(',') # if more than one writer
explicit_matched = (
    explicit_matched
    .explode('writers')
    .merge(map_df, left_on='writers', right_on='imdb_id', how='left')
    .dropna(subset=['wikidata_id'])  # keep only writers that have a wikidata id
)
print("Shape after matching: ", explicit_matched.shape) # on average, 3 writers per movie

## add new columns: age, party, decade
explicit_matched['age'] = explicit_matched.apply(
    lambda row: generate_age(row['year'], row['birth_year']), axis=1
)
explicit_matched['party'] = 'NAN'
explicit_matched['decade'] = explicit_matched['year'] // 10 * 10

# rename variable
explicit_matched = explicit_matched.rename(columns={'gender_str':'gender'})

## save
export_explicit_agent_match(explicit_matched, data_path + "movie_explicit_agent.csv")

Shape before matching:  (28253, 6)
Shape after matching:  (56888, 18)


In [6]:
# Implicit
## matching: movie -> writer ids (imdb_crew) -> writer metadata (map_df)
data_path="/zfs/projects/faculty/amirgo-management/code_review_data/movie/"
implicit_df = pd.read_csv(data_path + "movie_implicit_trend.csv")
print("Shape before matching: ", implicit_df.shape)
implicit_matched = (
    implicit_df
    .merge(imdb_crew, left_on='doc_id', right_on='tconst', how='left')
    .dropna(subset=['writers'])
)
# comma-separated writer ids -> list, so each writer gets its own row below
implicit_matched['writers'] = implicit_matched['writers'].str.split(',') # if more than one writer
implicit_matched = (
    implicit_matched
    .explode('writers')
    .merge(map_df, left_on='writers', right_on='imdb_id', how='left')
    .dropna(subset=['wikidata_id'])  # keep only writers that have a wikidata id
)
print("Shape after matching: ", implicit_matched.shape) # on average, 3 writers per movie

## add new columns: age, party, decade, object category
implicit_matched['age'] = implicit_matched.apply(
    lambda row: generate_age(row['year'], row['birth_year']), axis=1
)
implicit_matched['party'] = 'NAN'
implicit_matched['decade'] = implicit_matched['year'] // 10 * 10
# object category columns come from gen_group_category (export_utils)
implicit_matched = gen_group_category(implicit_matched, category_map)

# rename variable
implicit_matched = implicit_matched.rename(columns={'gender_str':'gender'})

## save
export_implicit_agent_match(implicit_matched, data_path + "movie_implicit_agent.csv")

Shape before matching:  (2295968, 7)
Shape after matching:  (5078593, 19)


# Fiction

In [7]:
# fiction: build the per-document author metadata used for matching below
fiction_path = "/zfs/projects/faculty/amirgo-management/HathiTrust/"
ses_df = pd.read_csv(fiction_path + "fiction_ses_characteristics_labelled_subset.csv")
meta_df = pd.read_csv(fiction_path + "post45fiction.csv")

# attach the author characteristics (keyed by wikidata qid) to the document metadata
meta_df = meta_df.merge(ses_df, left_on="author_wikidata_qid", right_on="qid", how="left")

kept_columns = [
    'docid', 'author', 'author_wikidata_qid', 'title', 'inferreddate', 'latestcomp',
    'birth_year', 'gender_str', 'if_college_ed', 'if_business_occupation', 'if_business_ed',
]
# keep rows with 1950 <= latestcomp < 2010
in_window = meta_df['latestcomp'].between(1950, 2010, inclusive='left')
meta_subset = meta_df.loc[in_window, kept_columns].copy()

In [12]:
## Explicit
## matching: attach author metadata (meta_subset) to each explicit-trend row via the document id
data_path="/zfs/projects/faculty/amirgo-management/code_review_data/fiction/"
explicit_df = pd.read_csv(data_path + "fiction_explicit_trend.csv")
print("Shape before matching: ", explicit_df.shape)
# Remove the ".txt" extension only where it is present. Blindly dropping the
# last four characters would silently truncate any id without the extension,
# and that row would then fail to match and be dropped below.
explicit_df['doc_id'] = explicit_df['doc_id'].str.removesuffix('.txt')
explicit_matched = (
    explicit_df
    .merge(meta_subset, left_on='doc_id', right_on='docid', how='left')
    .dropna(subset=['author_wikidata_qid'])  # keep only rows with a known author qid
)
print("Shape after matching: ", explicit_matched.shape)

## add new columns: age, party, decade
explicit_matched['age'] = explicit_matched.apply(
    lambda row: generate_age(row['year'], row['birth_year']), axis=1
)
explicit_matched['party'] = 'NAN'
explicit_matched['decade'] = explicit_matched['year'] // 10 * 10

# rename variable
explicit_matched = explicit_matched.rename(
    columns={'gender_str': 'gender', 'author_wikidata_qid': 'wikidata_id'}
)

## save
export_explicit_agent_match(explicit_matched, data_path + "fiction_explicit_agent.csv")

Shape before matching:  (367244, 6)
Shape after matching:  (216807, 17)


In [8]:
## Implicit
## matching: attach author metadata (meta_subset) to each implicit-trend row via the document id
data_path="/zfs/projects/faculty/amirgo-management/code_review_data/fiction/"
implicit_df = pd.read_csv(data_path + "fiction_implicit_trend.csv")
print("Shape before matching: ", implicit_df.shape)
# Remove the ".txt" extension only where it is present. Blindly dropping the
# last four characters would silently truncate any id without the extension,
# and that row would then fail to match and be dropped below.
implicit_df['doc_id'] = implicit_df['doc_id'].str.removesuffix('.txt')
implicit_matched = (
    implicit_df
    .merge(meta_subset, left_on='doc_id', right_on='docid', how='left')
    .dropna(subset=['author_wikidata_qid'])  # keep only rows with a known author qid
)
print("Shape after matching: ", implicit_matched.shape)

## add new columns: age, party, decade, object category
implicit_matched['age'] = implicit_matched.apply(
    lambda row: generate_age(row['year'], row['birth_year']), axis=1
)
implicit_matched['party'] = 'NAN'
implicit_matched['decade'] = implicit_matched['year'] // 10 * 10
# object category columns come from gen_group_category (export_utils)
implicit_matched = gen_group_category(implicit_matched, category_map)

# rename variable
implicit_matched = implicit_matched.rename(
    columns={'gender_str': 'gender', 'author_wikidata_qid': 'wikidata_id'}
)

## save
export_implicit_agent_match(implicit_matched, data_path + "fiction_implicit_agent.csv")

Shape before matching:  (40101869, 7)
Shape after matching:  (23071979, 18)


# Merge

In [9]:
# dataset name -> folder holding that dataset's "<name>_explicit_agent.csv" /
# "<name>_implicit_agent.csv" files (read by the pooling cells below)
data_paths = dict(
    congress="/zfs/projects/faculty/amirgo-management/code_review_data/congress/",
    movie="/zfs/projects/faculty/amirgo-management/code_review_data/movie/",
    fiction="/zfs/projects/faculty/amirgo-management/code_review_data/fiction/",
)

# folder that receives the pooled (all datasets) files
pooling_path = "/zfs/projects/faculty/amirgo-management/code_review_data/pooling/"

In [11]:
# Pool the per-dataset explicit agent files into one table and save it.
explicit_dfs = [
    pd.read_csv(path + f"{key}_explicit_agent.csv")
    for key, path in data_paths.items()
]
explicit_df = pd.concat(explicit_dfs, ignore_index=True)
print(explicit_df.columns)
explicit_df.to_csv(pooling_path+"explicit_agent.csv",index=False)

Index(['year', 'doc_id', 'sentence_id', 'IsPerson', 'secondaryLabel',
       'wikidata_id', 'party', 'birth_year', 'age', 'gender', 'if_college_ed',
       'if_business_occupation', 'if_business_ed', 'dataset', 'decade'],
      dtype='object')


In [12]:
# sanity check: number of pooled explicit rows contributed by each dataset
explicit_df.loc[:, 'dataset'].value_counts()

dataset
congress    249849
fiction     216807
movie        56888
Name: count, dtype: int64

In [10]:
# Pool the per-dataset implicit agent files into one table and save it.
# The recorded output shows a warning raised from this read_csv call (most
# likely pandas' mixed-type DtypeWarning from chunked dtype inference).
# low_memory=False makes pandas infer each column's dtype from the whole file,
# so a column cannot end up with different types in different chunks.
implicit_dfs = [
    pd.read_csv(path + f"{key}_implicit_agent.csv", low_memory=False)
    for key, path in data_paths.items()
]
implicit_df = pd.concat(implicit_dfs, ignore_index=True)
print(implicit_df.columns)
implicit_df.to_csv(pooling_path+"implicit_agent.csv",index=False)

  implicit_df = pd.read_csv(path + f"{key}_implicit_agent.csv")


Index(['year', 'doc_id', 'sentence_id', 'object', 'top_subgroup',
       'subgroup_orig_syn_ratio', 'wikidata_id', 'party', 'birth_year', 'age',
       'gender', 'if_college_ed', 'if_business_occupation', 'if_business_ed',
       'dataset', 'decade', 'object_category', 'if_relational_subjectivity'],
      dtype='object')


In [11]:
# sanity check: number of pooled implicit rows contributed by each dataset
implicit_df.loc[:, 'dataset'].value_counts()

dataset
fiction     23071979
movie        5078593
congress     3280718
Name: count, dtype: int64

In [14]:
# NOTE(review): repeats the check in the preceding cell (same expression, same output); safe to delete
implicit_df.loc[:, 'dataset'].value_counts()

dataset
fiction     23071979
movie        5078593
congress     3280718
Name: count, dtype: int64