In [None]:
import numpy as np
import pandas as pd
import json
from pathlib import Path
import re
import spacy

### load variable mapping

In [None]:
# Load the per-column lookup tables (encoded value -> label).
# `parse_int` must be a callable applied to integer literals in the JSON body; it never
# touches object keys, which JSON always delivers as strings. The previous `parse_int=True`
# would raise TypeError on the first integer literal, so it is dropped here.
with Path("variable_mapping.json").open("r", encoding="utf-8") as f:
    mapping = json.load(f)

In [None]:
# JSON object keys are read in as strings; cast every column's encoded values back to integers
for column, code_to_label in mapping.items():
    mapping[column] = {int(code): label for code, label in code_to_label.items()}

### load primary data

In [None]:
# the secondary body-part / diagnosis columns can be null, so read them as nullable ints
df = pd.read_csv(
    "corrected_narrative_primary.csv",
    dtype=dict.fromkeys(["body_part_2", "diagnosis_2"], "Int64"),
)
df.head()

### replace numeric values with corresponding strings

In [None]:
# work on a copy so the numerically-encoded frame stays available for comparison
decoded_df = df.copy()

for col, lookup in mapping.items():
    decoded_df[col] = decoded_df[col].map(lookup)

In [None]:
# sanity check: decoding must leave the per-column count of missing values unchanged
# (a code absent from the mapping would show up as an extra NaN)
assert (df.isna().sum() == decoded_df.isna().sum()).all()

In [None]:
# reproducible 5000-row sample drawn without replacement
random_sample = decoded_df.sample(5000, random_state=42)

In [None]:
# keep only the first two columns of the sample
random_sample_sub = random_sample.iloc[:, :2]

In [None]:
# preview the first five sampled rows
random_sample_sub.iloc[:5]

# NLP

In [None]:
# show which spaCy version this notebook is running against
print(spacy.__version__)

# My Model

Corrected data

In [None]:
# annotated narratives; later cells read the `key_entry`, `span` and `label` columns
fall = pd.read_csv("fall_set5_corrected_narrative.csv")

In [None]:
# first five annotation rows
fall.iloc[:5]

In [None]:
# last ten annotation rows
fall.iloc[-10:]

In [None]:
# drop fully identical annotation rows, then report how many remain
fall = fall.drop_duplicates()
fall.shape[0]

In [None]:
# rows without an annotated span
fall_nas = fall.loc[fall["span"].isna()]

In [None]:
# number of rows without a span
fall_nas.shape[0]

In [None]:
# every distinct label present in the annotations (including NaN, if any)
all_labels = fall["label"].unique()
all_labels

In [None]:
# labels kept for training; narrow the list (e.g. to ['SO']) to train on a subset
labels_to_use = [
    'CHR', 'TRS', 'LAD', 'OBJ', 'RCH', 'SF',
    'SHW', 'SO', 'STR', 'SU', 'WT',
]

In [None]:
# annotation rows whose label is one of the selected labels
fall_sub = fall.loc[fall["label"].isin(labels_to_use)]

In [None]:
# labelled rows first, then the rows without a span
fall_all = pd.concat([fall_sub, fall_nas])

In [None]:
# NOTE(review): this previews `fall`, not the freshly built `fall_all` — confirm which one was intended
fall.iloc[-5:]

In [None]:
import random

# seeded permutation of the row positions of fall_all (sampling all positions without replacement)
random.seed(99)
randomlist = random.sample(range(len(fall_all)), len(fall_all))
# peek at positions 1..4 of the permutation
print(randomlist[1:5])

In [None]:
# reorder the rows by the shuffled positions
fall_all_random = fall_all.iloc[randomlist]
fall_all_random.iloc[-5:]

In [None]:
# distinct narratives, in order of first appearance in the shuffled frame
key_list = fall_all_random["key_entry"].unique()
len(key_list)

In [None]:
# all annotation rows belonging to the second narrative in key_list
fall_all_random.loc[fall_all_random["key_entry"] == key_list[1]]

In [None]:
nlp = spacy.blank("en")
# special_case = [{Token.ORTH: "CLOSED-HEAD"}]
# nlp.tokenizer.add_special_case("CLOSED-HEAD", special_case)

from spacy.tokens import Span


def _token_bounds(doc, span_start_char, span_end_char):
    """Translate character offsets into a token range of ``doc``.

    Returns ``(start, end)`` token indices (``end`` exclusive) when a token begins
    exactly at ``span_start_char`` and that token, or a later one, finishes exactly
    at ``span_end_char``. Returns ``None`` otherwise, which includes the
    ``str.find`` "not found" case where ``span_start_char`` is -1.
    """
    start_token = None
    for token in doc:
        if token.idx == span_start_char:
            start_token = token.i
        if token.idx + len(token.text) == span_end_char:
            # The scan stops at the first token ending on the boundary; the range is
            # only usable if the start token was seen by then.
            if start_token is None:
                return None
            return start_token, token.i + 1
    return None


# keeping span token lengths to appropriately set config
token_lengths = []
label_list = []

# One Doc per unique narrative, with its labelled spans stored under doc.spans["sc"].
# Each narrative is appended at most once. Previously a loop-level append added every
# single-row narrative a second time, and also added single-row narratives whose span
# could not be aligned (with no spans set at all).
# NOTE(review): len(docs) is smaller than before, so any later slicing of `docs` at a
# fixed index should be re-checked.
docs = []
for k in key_list:
    doc = nlp(k)
    temp_df = fall_all_random[fall_all_random.key_entry == k]

    if len(temp_df) == 1:
        # Positional column 1 is read as the span text and column 2 as the label
        # (assumes the frame's column order is key_entry, span, label — TODO confirm).
        if pd.isna(temp_df.iloc[0, 2]):
            # No label: keep the narrative as an example with an empty span group.
            doc.spans["sc"] = []
            docs.append(doc)
        else:
            span_text = temp_df.iloc[0, 1]
            temp_label = temp_df.iloc[0, 2]
            # Only the first occurrence of the span text in the narrative is considered.
            span_start_char = k.find(span_text)
            span_end_char = span_start_char + len(span_text)

            bounds = _token_bounds(doc, span_start_char, span_end_char)
            if bounds is not None:
                temp_start, temp_end = bounds
                doc.spans["sc"] = [Span(doc, temp_start, temp_end, temp_label)]
                docs.append(doc)
                token_lengths.append(temp_end - temp_start)
                label_list.append(temp_label)
            else:
                # The span does not line up with token boundaries: report it and leave
                # the narrative out rather than store it as if it had no span.
                print(k, "span=", span_text,"couldn't find tokens")
    else:
        print("temp_df has length > 1")
        span_list = []
        for ent in range(len(temp_df)):
            span_text = temp_df.iloc[ent, 1]
            temp_label = temp_df.iloc[ent, 2]
            span_start_char = k.find(span_text)
            span_end_char = span_start_char + len(span_text)
            print(span_start_char, span_end_char)

            bounds = _token_bounds(doc, span_start_char, span_end_char)
            if bounds is not None:
                temp_start, temp_end = bounds
                span_list.append(Span(doc, temp_start, temp_end, temp_label))
                token_lengths.append(temp_end - temp_start)
                label_list.append(temp_label)
            else:
                print(k, "span=",span_text, "couldn't find tokens")

        # Multi-row narratives are kept with whichever spans could be aligned.
        doc.spans["sc"] = span_list
        docs.append(doc)
          

In [None]:
# number of Doc objects collected by the loop above
len(docs)

In [None]:
# number of spans that were successfully aligned to tokens (one length is recorded per span)
len(token_lengths)

In [None]:
# shortest, longest and median span length, in tokens
print(*(stat(token_lengths) for stat in (np.min, np.max, np.median)))

In [None]:
# 5th and 95th percentile of span length, in tokens
np.quantile(token_lengths, [0.05, 0.95])

In [None]:
# distinct labels that ended up on at least one aligned span
pd.unique(np.asarray(label_list))

Make training and dev (evaluation) sets from the docs

In [None]:
from spacy.tokens import DocBin

In [None]:
# training portion: the first 800 docs (the split index is hard-coded)
doc_bin = DocBin(docs=docs[:800])

In [None]:
# serialise the current doc_bin as the training file in the working directory
doc_bin.to_disk("./train_230929_mult_labs1017.spacy")

In [None]:
# dev portion: every doc from position 800 onward; note that this rebinds `doc_bin`
doc_bin = DocBin(docs=docs[800:])
doc_bin.to_disk("./dev_230929_mult_labs1017.spacy")

Generate a starter spancat training config from the command line:

```
python -m spacy init config ./config.cfg --lang en --pipeline spancat
```

## Using model

In [None]:
# Load the trained pipeline from disk.
# NOTE(review): this is an absolute, user-specific path, so the cell only runs on a machine
# with this exact directory — consider a path relative to the notebook or a configurable model dir.
nlp_spancat = spacy.load("/Users/wendyphillips/Documents/Computing/WendysPython/Falling_analysis/code/outputs230929_mult_labs1017/model-best")

In [None]:
# list the components of the loaded pipeline
nlp_spancat.pipeline

In [None]:
# run the trained pipeline on one sampled row (position 2101, second column) and echo its text
test_text = random_sample_sub.iat[2101, 1]
doc = nlp_spancat(test_text)
doc.text

In [None]:
# span groups attached to the processed doc (the loop below reads the "sc" group)
doc.spans

In [None]:
# one line per predicted span: label, start, end, covered text
for predicted in doc.spans["sc"]:
    print(predicted.label_, predicted.start, predicted.end, predicted.text)

In [None]:
# eyeball the predictions for 50 sampled narratives: each narrative, then its predicted spans
random_sample_sub2 = random_sample_sub.iloc[2000:2050]
for narrative in random_sample_sub2["narrative"]:
    print(narrative)
    doc = nlp_spancat(narrative)
    for predicted in doc.spans["sc"]:
        print(predicted.label_, predicted.start, predicted.end, predicted.text)

In [None]:
random_sample_sub2 = random_sample_sub.iloc[2000:2500,]

# Collect one (text, span_label, span_text) record per predicted span, or a single
# ("NA", "NA") placeholder record for narratives with no predicted span.
# The frame is built once from the records rather than concatenated row by row, which
# also gives the rows a unique 0..n-1 index.
# The values are written in the same order as the column names. Previously the row frames
# declared the columns as (text, span_text, span_label) while supplying (text, label, span
# text), so labels and span texts ended up in each other's column.
prediction_records = []

for text in random_sample_sub2['narrative']:
    doc = nlp_spancat(text)
    predicted_spans = doc.spans["sc"]

    if len(predicted_spans) == 0:
        prediction_records.append((text, "NA", "NA"))
    else:
        prediction_records.extend(
            (text, span.label_, span.text) for span in predicted_spans
        )

output_df = pd.DataFrame(prediction_records, columns=['text', 'span_label', 'span_text'])

In [None]:
# write the predictions to the working directory; with the default arguments the DataFrame
# index is written as the first column
output_df.to_csv("predictions_1017.csv")