// Start the Python interpreter from Stata
python
# Loads all of the modules needed in this script
import json
import requests
import pandas as pd
import spacy
from sfi import ValueLabel, Data, SFIToolkit
# Torch is Facebook's deep learning library; it runs reasonably well on a CPU-only machine
import torch
torch.manual_seed(0)
# For examples using BERT see the jupyter notebook in the repository
# This will load the tokenizers and models using the BERT architecture
# from transformers import BertTokenizer, BertModel
# This will initialize the tokenizer and download the pretrained model parameters
# You can also use 'bert-large-cased' if you are using Stata SE or Stata MP.
# 'bert-large-cased' will produce 1,024 dimensional vectors, while
# 'bert-base-cased' will return only 768 dimensional vectors.
# If you really need something more expressive, there are other pre-trained models available that will return
# > 2,000 dimensional vectors (e.g., GPT-Neo, xlm-mlm-en-2048, albert-xxlarge-v1/2)
# tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case = False)
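# A minimal sketch (kept commented out, like the lines above) of pulling a
# message-level embedding out of BERT; the names below are illustrative and
# not part of the original script:
# model = BertModel.from_pretrained('bert-base-cased')
# inputs = tokenizer("An example message.", return_tensors = 'pt')
# with torch.no_grad():
#     outputs = model(**inputs)
# # Mean-pool the final hidden states into a single 768-dimensional vector
# sentvec = outputs.last_hidden_state.mean(dim = 1).squeeze()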
# We'll also load up the model for spaCy at this time
nlp = spacy.load('en_core_web_lg')
# List of the URLs containing the data set
files = [ "https://raw.githubusercontent.com/DenisPeskov/2020_acl_diplomacy/master/data/test.jsonl",
          "https://raw.githubusercontent.com/DenisPeskov/2020_acl_diplomacy/master/data/train.jsonl",
          "https://raw.githubusercontent.com/DenisPeskov/2020_acl_diplomacy/master/data/validation.jsonl" ]
# Function to handle dropping "variables" that prevent pandas from
# reading the JSON object
def normalizer(obs: dict, drop: list) -> pd.DataFrame:
    # Loop over the "variables" to drop
    for i in drop:
        # Remove it from the dictionary object
        del obs[i]
    # Returns the pandas DataFrame
    return pd.DataFrame.from_dict(obs)
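# As a quick illustration (the values below are made up, not real data), keys
# whose lengths don't line up with the per-message lists are exactly what
# breaks pd.DataFrame.from_dict, so they get dropped:
# toy = { "messages": ["Hi", "Hello", "Bye"],
#         "speakers": ["italy", "england", "italy"],
#         "players": ["italy", "england"], "game_id": 1 }
# normalizer(toy, ["players", "game_id"])  # -> DataFrame with 3 rows, 2 columns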
# Object to store each of the data frames
data = []
# Loop over each of the files from the URLs above
for i in files:
    # Get the raw content from the GitHub location
    content = requests.get(i).content
    # Split the JSON objects by new lines, pass each individual line to json.loads,
    # pass the json.loads value to the normalizer function, and
    # append the result to the data object defined outside of the loop
    # You should be able to add '_ = ' at the start of the next line to suppress the output
    [ data.append(normalizer(json.loads(line), [ "players", "game_id" ])) for line in content.decode('utf-8').splitlines() ]
# Define a couple data mappings for later use
labmap = { True: 1, False: 0, 'NOANNOTATION': -1 }
cntrys = { 'austria': 0, 'england': 1, 'france': 2, 'germany': 3, 'italy': 4, 'russia': 5, 'turkey': 6 }
seasons = { 'Fall': 0, 'Winter': 1, 'Spring': 2 }
# Combine each of the data frames for each game into one large dataset
dataset = pd.concat(data, axis = 0, join = 'inner', ignore_index = True, sort = False)
# Change data types of a couple columns
dataset['game_score'] = dataset['game_score'].astype('int')
dataset['sender_labels'] = dataset['sender_labels'].astype('int')
dataset['absolute_message_index'] = dataset['absolute_message_index'].astype('int')
dataset['relative_message_index'] = dataset['relative_message_index'].astype('int')
dataset['game_score_delta'] = dataset['game_score_delta'].astype('int')
dataset['years'] = dataset['years'].astype('int')
# Recodes text labels to numeric values
dataset.replace({'receiver_labels': labmap, 'speakers': cntrys, 'receivers': cntrys, 'seasons': seasons}, inplace = True)
# Creates an indicator for when the receiver correctly identifies the truthfulness of the message
dataset['correct'] = (dataset['sender_labels'] == dataset['receiver_labels']).astype('int')
# Get the number of tokens per message using spaCy's tokenizer
dataset['tokens'] = dataset['messages'].apply(lambda x: len(nlp(x)))
# Tokenize each message so the data can be expanded by token
dataset['token'] = dataset['messages'].apply(lambda x: nlp(x))
# Now the data set can be expanded by unique tokens
dataset = dataset.explode('token')
# Make sure the token variable is cast as a string
# If you don't do this you'll get an error saying that Stata couldn't store the
# string value in the current Stata dataset
dataset['token'] = dataset['token'].astype('str')
# Then add IDs for each token (these values also use zero-based indexing)
dataset['tokenid'] = dataset.groupby('messages').cumcount()
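# For example, a three-token message gets tokenid values 0, 1, and 2 from the
# groupby/cumcount call above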
# Get the names of the variables
varnms = dataset.columns
# Sets the number of observations based on the messages column
Data.setObsTotal(len(dataset['messages']))
# Create the variables in Stata
for var in varnms:
    if var not in [ 'messages', 'token' ]:
        Data.addVarLong(var)
    else:
        Data.addVarStrL(var)
# Now push the data into Stata
Data.store(var = None, obs = None, val = dataset.values.tolist())
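# Note: Data.store fills every variable and observation at once here, so the
# columns in dataset.values.tolist() need to be in the same order the variables
# were created above; both follow dataset.columns, so the orders match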
# Create mapping of value labels to variables
vallabmap = { 'sender_labels' : labmap, 'receiver_labels': labmap, 'seasons': seasons, 'speakers': cntrys, 'receivers': cntrys }
# Loop over the dictionary containing the value label mappings
for varnm, vallabs in vallabmap.items():
    # Create the value label in Stata
    ValueLabel.createLabel(varnm)
    # Now iterate over the value label mappings to define each value/label pair
    # Again if you want to suppress the output add '_ = ' at the start of the next line
    [ ValueLabel.setLabelValue(varnm, value, str(label)) for label, value in vallabs.items() ]
    # Now the value label can be attached to the corresponding variable
    ValueLabel.setVarValueLabel(varnm, varnm)
# Since we know the length of the vector in advance (en_core_web_lg returns
# 300-dimensional word vectors), we can create all of the variables that we
# want, so we'll create variables for the individual word vectors
# Again if you want to suppress the output add '_ = ' at the start of the next line
[ Data.addVarDouble('wembed' + str(i)) for i in range(1, 301) ]
# Loop over each token in the data set
for ob, token in enumerate(dataset['token'].tolist()):
    # Gets the spaCy embedding for the token
    embed = nlp(token)
    # Stores the word vector for this token
    # Again if you want to suppress the output add '_ = ' at the start of the next line
    [ Data.storeAt("wembed" + str(dim + 1), ob, embed.vector[dim]) for dim in range(0, len(embed.vector)) ]
# You can now fit a model to the data:
SFIToolkit.stata("logit correct i.speakers i.seasons i.years i.game_score wembed1-wembed300")
# These results are fairly noisy, so document vectors may fare better
SFIToolkit.stata("drop token tokenid wembed*")
SFIToolkit.stata("duplicates drop")
# Drop the token-level columns and duplicate rows from the pandas DataFrame as
# well, so the observation indices line up with the deduplicated Stata dataset
# (this assumes duplicates drop keeps the first occurrences in order)
dataset = dataset.drop(columns = ['token', 'tokenid']).drop_duplicates(ignore_index = True)
# Now use the same process used above, but with document vectors
[ Data.addVarDouble('docembed' + str(i)) for i in range(1, 301) ]
for ob, message in enumerate(dataset['messages'].tolist()):
    # Gets the spaCy embedding for the whole message
    embed = nlp(message)
    # Stores the document vector for this message
    # Again if you want to suppress the output add '_ = ' at the start of the next line
    [ Data.storeAt("docembed" + str(dim + 1), ob, embed.vector[dim]) for dim in range(0, len(embed.vector)) ]
# This model fits the data a bit better than the previous model and is also
# noticeably faster to estimate.
SFIToolkit.stata("logit correct i.speakers i.seasons i.years i.game_score docembed1-docembed300")
end
// Now you can start using the embeddings for additional models
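// As one illustrative next step (not part of the original script), Stata's
// lasso could select among the document-vector terms:
// lasso logit correct docembed1-docembed300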