In [5]:
import pandas

In [24]:
data=pandas.read_csv('protocol_entities_stanza_results.tsv',sep='\t')

In [25]:
data.head(5)

Unnamed: 0,protocol_id,protocol_start_char,protocol_end_char,type,text
0,acute_bacterial_meningitis_0,0,26,PROBLEM,Acute bacterial meningitis
1,acute_bacterial_meningitis_0,30,108,PROBLEM,rapidly progressive bacterial infection of the...
2,acute_bacterial_meningitis_0,137,145,PROBLEM,headache
3,acute_bacterial_meningitis_0,147,152,PROBLEM,fever
4,acute_bacterial_meningitis_0,158,173,PROBLEM,nuchal rigidity


In [43]:
diabetes_mellitus_data = data[data['protocol_id'].str.startswith("diabetes_mellitus")]

In [44]:
diabetes_mellitus_data.head(5)

Unnamed: 0,protocol_id,protocol_start_char,protocol_end_char,type,text
3171,diabetes_mellitus_0_0,0,17,PROBLEM,Diabetes mellitus
3172,diabetes_mellitus_0_0,21,47,PROBLEM,impaired insulin secretion
3173,diabetes_mellitus_0_0,72,101,PROBLEM,peripheral insulin resistance
3174,diabetes_mellitus_0_0,113,126,PROBLEM,hyperglycemia
3175,diabetes_mellitus_0_0,128,142,PROBLEM,Early symptoms


In [8]:
import requests
def query_repository(text_name, repo_name):
    # URL-encode the text_name
    encoded_text_name = requests.utils.quote(text_name.lower())
    url = f"http://localhost:7200/repositories/{repo_name}?query=prefix schema: <https://schema.org/>  select ?o where {{ ?s a ?o . ?s schema:name ?name . filter(CONTAINS(lcase(?name), \"{encoded_text_name}\")) }}"
    headers = {'Accept': 'application/sparql-results+json'}
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.text
    else:
        return "Error: " + str(response.status_code)

In [13]:
query_repository("statins", "DM")

'{\n  "head" : {\n    "vars" : [\n      "o"\n    ]\n  },\n  "results" : {\n    "bindings" : [\n      {\n        "o" : {\n          "type" : "uri",\n          "value" : "https://schema.org/Drug"\n        }\n      }\n    ]\n  }\n}'

In [13]:
def apply_query_repository(row, repo):
    return query_repository(str(row['text']), repo)

In [45]:
# Apply the function to each row and store the results in a new column
diabetes_mellitus_data['query_results'] = diabetes_mellitus_data.apply(lambda row: apply_query_repository(row, "DM"), axis=1)

print(diabetes_mellitus_data[['text', 'query_results']])

                                               text  \
3171                              Diabetes mellitus   
3172                     impaired insulin secretion   
3173                  peripheral insulin resistance   
3174                                  hyperglycemia   
3175                                 Early symptoms   
...                                             ...   
5945  renin-angiotensin-aldosterone system blockers   
5946                                        statins   
5947                                  complications   
5948                  renin-angiotensin-aldosterone   
5949                                        statins   

                                          query_results  
3171  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
3172  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
3173  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
3174  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
3175  {\n  "head" : {\n    "vars" : [\n      "o"\

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabetes_mellitus_data['query_results'] = diabetes_mellitus_data.apply(lambda row: apply_query_repository(row, "DM"), axis=1)


In [10]:
import json
def extract_value_from_json(json_str):
    if json_str.startswith('Error'):  # Check for error responses
        return ''  # Return empty string for errors or no response
    try:
        data = json.loads(json_str)
        bindings = data.get('results', {}).get('bindings', [])
        if not bindings:
            return ''  # Return empty if no bindings are present
        for item in bindings:
            o_value_url = item.get('o', {}).get('value', '')
            # Remove the 'https://schema.org/' prefix and return the last part
            if o_value_url.startswith('https://schema.org/'):
                return o_value_url.replace('https://schema.org/', '')
        return ''  # Return empty if the expected format is not found
    except json.JSONDecodeError:
        return ''  # Return empty string in case of JSON decode error

In [47]:
# Apply the function to extract the specific value from each row's 'query_results'
diabetes_mellitus_data['query_results'] = diabetes_mellitus_data['query_results'].apply(extract_value_from_json)

print(diabetes_mellitus_data[['text', 'query_results']])

                                               text         query_results
3171                              Diabetes mellitus      MedicalCondition
3172                     impaired insulin secretion  MedicalSignOrSymptom
3173                  peripheral insulin resistance          MedicalCause
3174                                  hyperglycemia      MedicalCondition
3175                                 Early symptoms                      
...                                             ...                   ...
5945  renin-angiotensin-aldosterone system blockers                  Drug
5946                                        statins                  Drug
5947                                  complications        MedicalWebPage
5948                  renin-angiotensin-aldosterone                  Drug
5949                                        statins                  Drug

[2779 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabetes_mellitus_data['query_results'] = diabetes_mellitus_data['query_results'].apply(extract_value_from_json)


In [19]:
file_path = 'dm_with_kg_ner.tsv'

# Save the DataFrame to a TSV file
diabetes_mellitus_data.to_csv(file_path, sep='\t', index=False)

### MI

In [21]:
MI_data = data[data['protocol_id'].str.startswith("acute_myocardial_infarction")]

In [22]:
MI_data.head(5)

Unnamed: 0,protocol_id,protocol_start_char,protocol_end_char,type,text
11086,acute_myocardial_infarction_0,0,27,PROBLEM,Acute myocardial infarction
11087,acute_myocardial_infarction_0,31,50,PROBLEM,myocardial necrosis
11088,acute_myocardial_infarction_0,66,104,PROBLEM,acute obstruction of a coronary artery
11089,acute_myocardial_infarction_0,106,114,PROBLEM,Symptoms
11090,acute_myocardial_infarction_0,123,139,PROBLEM,chest discomfort


In [31]:
# Apply the function to each row and store the results in a new column
MI_data['query_results'] = MI_data.apply(lambda row: apply_query_repository(row, "MI"), axis=1)

print(MI_data[['text', 'query_results']])

                                           text  \
11086               Acute myocardial infarction   
11087                       myocardial necrosis   
11088    acute obstruction of a coronary artery   
11089                                  Symptoms   
11090                          chest discomfort   
...                                         ...   
12775                             beta-blockers   
12776  angiotensin-converting enzyme inhibitors   
12777                                   statins   
12778                              antiplatelet   
12779                                   statins   

                                           query_results  
11086  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
11087  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
11088  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
11089  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
11090  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
...                              

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  MI_data['query_results'] = MI_data.apply(lambda row: apply_query_repository(row, "MI"), axis=1)


In [32]:
MI_data['query_results'] = MI_data['query_results'].apply(extract_value_from_json)

print(MI_data[['text', 'query_results']])

                                           text         query_results
11086               Acute myocardial infarction        MedicalTherapy
11087                       myocardial necrosis      MedicalCondition
11088    acute obstruction of a coronary artery          MedicalCause
11089                                  Symptoms  MedicalSignOrSymptom
11090                          chest discomfort        MedicalSymptom
...                                         ...                   ...
12775                             beta-blockers             DrugClass
12776  angiotensin-converting enzyme inhibitors                      
12777                                   statins             DrugClass
12778                              antiplatelet                  Drug
12779                                   statins             DrugClass

[1694 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  MI_data['query_results'] = MI_data['query_results'].apply(extract_value_from_json)


In [34]:
file_path = 'mi_with_kg_ner.tsv'

# Save the DataFrame to a TSV file
MI_data.to_csv(file_path, sep='\t', index=False)

### Ankle Fractures

In [35]:
ankle_fractures_data = data[data['protocol_id'].str.startswith("ankle_fractures")]

In [38]:
# Apply the function to each row and store the results in a new column
ankle_fractures_data['query_results'] = ankle_fractures_data.apply(lambda row: apply_query_repository(row, "AnkleFractures"), axis=1)

print(ankle_fractures_data[['text', 'query_results']])

                    text                                      query_results
1203     Ankle fractures  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...
1204     These fractures  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...
1205              x-rays  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...
1206                 MRI  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...
1207      a walking boot  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...
...                  ...                                                ...
1433           fractures  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...
1434           fractures  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...
1435     ankle fractures  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...
1436  unstable fractures  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...
1437                ORIF  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...

[235 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ankle_fractures_data['query_results'] = ankle_fractures_data.apply(lambda row: apply_query_repository(row, "AnkleFractures"), axis=1)


In [39]:
ankle_fractures_data['query_results'] = ankle_fractures_data['query_results'].apply(extract_value_from_json)

print(ankle_fractures_data[['text', 'query_results']])

                    text     query_results
1203     Ankle fractures  MedicalCondition
1204     These fractures                  
1205              x-rays       MedicalTest
1206                 MRI                  
1207      a walking boot                  
...                  ...               ...
1433           fractures  MedicalCondition
1434           fractures  MedicalCondition
1435     ankle fractures  MedicalCondition
1436  unstable fractures  MedicalProcedure
1437                ORIF  MedicalProcedure

[235 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ankle_fractures_data['query_results'] = ankle_fractures_data['query_results'].apply(extract_value_from_json)


In [40]:
file_path = 'ankle_fractures_with_kg_ner.tsv'

# Save the DataFrame to a TSV file
ankle_fractures_data.to_csv(file_path, sep='\t', index=False)

In [50]:
import pandas as pd
def merge_tsv_files(output_file_path):
    # Read each TSV file into a DataFrame and store them in a list
    dataframes = [MI_data, diabetes_mellitus_data, ankle_fractures_data]
    
    # Concatenate all the DataFrames into one
    combined_df = pd.concat(dataframes, ignore_index=True)
    
    # Save the combined DataFrame to a new TSV file
    combined_df.to_csv(output_file_path, sep='\t', index=False)
    
    print(f'Combined DataFrame saved to {output_file_path}')

# Example usage
output_file_path = 'ner_with_kg.tsv'  # Desired path for the combined TSV file
merge_tsv_files(output_file_path)

Combined DataFrame saved to ner_with_kg.tsv


### All in one

In [26]:
protocols = ['acute_bacterial_meningitis', 'ankle_fractures', 'hypertension', 'diabetes_mellitus', 'distal_humeral_fractures', 'hypothyroidism', 'influenza', 'intracerebral_hemorrhage', 'ischemic_stroke', 'acute_myocardial_infarction']

In [27]:
for protocol in protocols:
    extracted_df = ankle_fractures_data = data[data['protocol_id'].str.startswith(protocol)]
    # Apply the function to each row and store the results in a new column
    extracted_df['query_results'] = ankle_fractures_data.apply(lambda row: apply_query_repository(row, protocol), axis=1)

    print(extracted_df[['text', 'query_results']])
    extracted_df['query_results'] = extracted_df['query_results'].apply(extract_value_from_json)

    print(extracted_df[['text', 'query_results']])
    file_path = f'{protocol}_with_kg_ner.tsv'

    # Save the DataFrame to a TSV file
    extracted_df.to_csv(file_path, sep='\t', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extracted_df['query_results'] = ankle_fractures_data.apply(lambda row: apply_query_repository(row, protocol), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extracted_df['query_results'] = extracted_df['query_results'].apply(extract_value_from_json)


                                                   text  \
0                            Acute bacterial meningitis   
1     rapidly progressive bacterial infection of the...   
2                                              headache   
3                                                 fever   
4                                       nuchal rigidity   
...                                                 ...   
1198                                    N. meningitidis   
1199                                         meningitis   
1200                                       H. influenza   
1201                                         meningitis   
1202                                         meningitis   

                                          query_results  
0     {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
1     {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
2     {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
3     {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
4

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extracted_df['query_results'] = ankle_fractures_data.apply(lambda row: apply_query_repository(row, protocol), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extracted_df['query_results'] = extracted_df['query_results'].apply(extract_value_from_json)


                    text                                      query_results
1203     Ankle fractures  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...
1204     These fractures  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...
1205              x-rays  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...
1206                 MRI  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...
1207      a walking boot  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...
...                  ...                                                ...
1433           fractures  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...
1434           fractures  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...
1435     ankle fractures  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...
1436  unstable fractures  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...
1437                ORIF  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...

[235 rows x 2 columns]
                    text     query_results
1203     Ankle fractu

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extracted_df['query_results'] = ankle_fractures_data.apply(lambda row: apply_query_repository(row, protocol), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extracted_df['query_results'] = extracted_df['query_results'].apply(extract_value_from_json)


                                 text  \
1438                     Hypertension   
1439              sustained elevation   
1440  resting systolic blood pressure   
1441         diastolic blood pressure   
1442                     Hypertension   
...                               ...   
3166                         thiazide   
3167                   angiotensin II   
3168                  dihydropyridine   
3169                          calcium   
3170                     hypertension   

                                          query_results  
1438  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
1439  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
1440  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
1441  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
1442  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
...                                                 ...  
3166  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
3167  {\n  "head" : {\n    "vars" : [\n    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extracted_df['query_results'] = ankle_fractures_data.apply(lambda row: apply_query_repository(row, protocol), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extracted_df['query_results'] = extracted_df['query_results'].apply(extract_value_from_json)


                                               text  \
3171                              Diabetes mellitus   
3172                     impaired insulin secretion   
3173                  peripheral insulin resistance   
3174                                  hyperglycemia   
3175                                 Early symptoms   
...                                             ...   
5945  renin-angiotensin-aldosterone system blockers   
5946                                        statins   
5947                                  complications   
5948                  renin-angiotensin-aldosterone   
5949                                        statins   

                                          query_results  
3171  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
3172  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
3173  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
3174  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
3175  {\n  "head" : {\n    "vars" : [\n      "o"\

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extracted_df['query_results'] = ankle_fractures_data.apply(lambda row: apply_query_repository(row, protocol), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extracted_df['query_results'] = extracted_df['query_results'].apply(extract_value_from_json)


                          text  \
5950  Distal humeral fractures   
5951                    a fall   
5952      neurovascular injury   
5953                   humeral   
5954             neurovascular   
...                        ...   
6204         anterior fat pads   
6205     anterior humeral line   
6206      radiocapitellar line   
6207                  fracture   
6208                 treatment   

                                          query_results  
5950  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
5951  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
5952  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
5953  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
5954  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
...                                                 ...  
6204  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
6205  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
6206  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
6207  {\n  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extracted_df['query_results'] = ankle_fractures_data.apply(lambda row: apply_query_repository(row, protocol), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extracted_df['query_results'] = extracted_df['query_results'].apply(extract_value_from_json)


                                 text  \
6209                   Hypothyroidism   
6210       thyroid hormone deficiency   
6211                         Symptoms   
6212                 cold intolerance   
6213                          fatigue   
...                               ...   
7067                    Myxedema coma   
7068  a life-threatening complication   
7069                        treatment   
7070                             coma   
7071                    Myxedema coma   

                                          query_results  
6209  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
6210  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
6211  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
6212  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
6213  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
...                                                 ...  
7067  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
7068  {\n  "head" : {\n    "vars" : [\n    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extracted_df['query_results'] = ankle_fractures_data.apply(lambda row: apply_query_repository(row, protocol), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extracted_df['query_results'] = extracted_df['query_results'].apply(extract_value_from_json)


                               text  \
7072                      Influenza   
7073  a viral respiratory infection   
7074                          fever   
7075                         coryza   
7076                          cough   
...                             ...   
7936      different influenza types   
7937                different drugs   
7938                antiviral drugs   
7939                    vaccination   
7940                    vaccination   

                                          query_results  
7072  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
7073  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
7074  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
7075  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
7076  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
...                                                 ...  
7936  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
7937  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
7938  {\n

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extracted_df['query_results'] = ankle_fractures_data.apply(lambda row: apply_query_repository(row, protocol), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extracted_df['query_results'] = extracted_df['query_results'].apply(extract_value_from_json)


                                        text  \
7941                Intracerebral hemorrhage   
7942                          focal bleeding   
7943  a blood vessel in the brain parenchyma   
7944                            hypertension   
7945                        Typical symptoms   
...                                      ...   
8586        increased intracerebral pressure   
8587                             ventricular   
8588                           intracerebral   
8589                           hydrocephalus   
8590                           hydrocephalus   

                                          query_results  
7941  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
7942  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
7943  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
7944  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
7945  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
...                                                 ...  
8586  {\n  "head"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extracted_df['query_results'] = ankle_fractures_data.apply(lambda row: apply_query_repository(row, protocol), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extracted_df['query_results'] = extracted_df['query_results'].apply(extract_value_from_json)


                             text  \
8591              Ischemic stroke   
8592   sudden neurologic deficits   
8593      focal cerebral ischemia   
8594   permanent brain infarction   
8595       diffusion-weighted MRI   
...                           ...   
11081              endarterectomy   
11082                    stenting   
11083            ischemic strokes   
11084                      statin   
11085            ischemic strokes   

                                           query_results  
8591   {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
8592   {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
8593   {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
8594   {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
8595   {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
...                                                  ...  
11081  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
11082  {\n  "head" : {\n    "vars" : [\n      "o"\n  ...  
11083  {\n  "head" : {\n

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extracted_df['query_results'] = ankle_fractures_data.apply(lambda row: apply_query_repository(row, protocol), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extracted_df['query_results'] = extracted_df['query_results'].apply(extract_value_from_json)


In [None]:
import pandas as pd
def merge_tsv_files(output_file_path):
    # Read each TSV file into a DataFrame and store them in a list
    dataframes = [MI_data, diabetes_mellitus_data, ankle_fractures_data]
    
    # Concatenate all the DataFrames into one
    combined_df = pd.concat(dataframes, ignore_index=True)
    
    # Save the combined DataFrame to a new TSV file
    combined_df.to_csv(output_file_path, sep='\t', index=False)
    
    print(f'Combined DataFrame saved to {output_file_path}')

# Example usage
output_file_path = 'ner_with_kg.tsv'  # Desired path for the combined TSV file
merge_tsv_files(output_file_path)

In [28]:
# Re-importing necessary libraries after reset
import os
import pandas as pd


def merge_tsv_files_from_directory(output_file_path):
    dataframes = []
    
    current_directory = './results'
    
    # Get all .tsv files in the specified directory
    tsv_files = [file for file in os.listdir(current_directory) if file.endswith('.tsv')]
    
    # Read each TSV file into a DataFrame and append to the list
    for file in tsv_files:
        df = pd.read_csv(os.path.join(current_directory, file), sep='\t')
        dataframes.append(df)
    
    # Concatenate all the DataFrames into one
    combined_df = pd.concat(dataframes, ignore_index=True)
    
    # Save the combined DataFrame to a new TSV file
    combined_df.to_csv(output_file_path, sep='\t', index=False)
    
    print(f'Combined DataFrame saved to {output_file_path}')

# Example usage
output_file_path = './ner_with_kg_combined_stanza.tsv'  # Desired path for the combined TSV file
merge_tsv_files_from_directory(output_file_path)


Combined DataFrame saved to ./ner_with_kg_combined.tsv
