# Google Drive connection

In [None]:
# Mount Google Drive at /content/drive; the dataset files read below are addressed through this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth',None)   # None = no limit on column width, so long post texts are displayed in full instead of being truncated with '...'

# Reading the 4 datasets into one dataframe

In [None]:
# Load every dataset export and stack them into a single dataframe.
# The frames are collected in a list and concatenated once at the end:
# calling pd.concat inside the loop re-copies all previously loaded rows on
# every iteration, and it also required seeding the result with an empty frame.
dataset_dir = '/content/drive/MyDrive/Reddit Migraine Dataset/'
dataset_name = ['headache_07_12','headaches_07_12','migraine_07_12','ndph_07_12']
# Columns of the exports that are dropped right after loading.
unused_columns = ['Unnamed: 0','flair_text','date','domain','author', 'score']

frames = []
print("STARTING TO READ DATASETS\n")
for name in dataset_name:
  url = dataset_dir+name+'.xlsx'
  print("**********************************\n")
  print("Reading Dataset:",name)
  df_temp = pd.read_excel(url)
  print("length of Dataset is:",len(df_temp),"\n")
  frames.append(df_temp.drop(columns=unused_columns))

# ignore_index=True renumbers the combined rows 0..n-1, replacing the separate reset_index call.
df = pd.concat(frames, axis=0, ignore_index=True)
print("\nALL DATASETS READING FINISHED")

STARTING TO READ DATASETS

**********************************

Reading Dataset: headache_07_12
length of Dataset is: 521 

**********************************

Reading Dataset: headaches_07_12
length of Dataset is: 548 

**********************************

Reading Dataset: migraine_07_12
length of Dataset is: 2324 

**********************************

Reading Dataset: ndph_07_12
length of Dataset is: 232 


ALL DATASETS READING FINISHED


In [None]:
df.shape

(3625, 3)

# Operations performed on dataframe:
1. if there is no 'text', then the title is copied into the text
2. Duplicate rows are removed from the dataframe

In [None]:
# Removal of NaN values: a post without a body gets its title as text.
# fillna assigns the whole column in one step (aligned on the index). This replaces
# the row-by-row chained assignment df['text'][i] = ..., which triggers
# SettingWithCopyWarning and does not update df when pandas Copy-on-Write is enabled.
df['text'] = df['text'].fillna(df['title'])

In [None]:
# The title has been copied into 'text' wherever it was needed, so the column can go.
df = df.drop(columns=['title'])

In [None]:
# keep=False flags every member of a duplicate group, so this counts all rows that have
# at least one identical copy. It is not the number of rows drop_duplicates will remove.
df.duplicated(keep=False).sum()

1218

In [None]:
# Keep the first occurrence of each fully identical row (the default) and discard the rest.
df = df.drop_duplicates()

In [None]:
# Renumber the rows 0..n-1: the keyword loop further down looks rows up by label using range(len(df)).
df = df.reset_index(drop=True)

In [None]:
# Look, Ma! no duplicates.
df.duplicated(keep=False).sum()

0

In [None]:
df.shape

(2985, 2)

In [None]:
df.empty # True only when the dataframe has no rows or no columns; it does NOT detect empty or NaN cells (the isna() check below covers that)

False

In [None]:
len(df[df['text'].isna()])  # To see if there are NaN in cells

0

In [None]:
len(df['id'].unique()) # the number of distinct ids should equal the number of rows in df, i.e. every remaining row is a different post

2985

In [None]:
df.head(2)

In [None]:
df.shape

(2985, 2)

# Keywords-based Filtering

The entries which do not contain these keywords will be dropped from the dataset

In [None]:
# Search terms for the keyword filter; the filtering cell below matches them
# case-insensitively as substrings of each post's text.
keywords = [
    'Migraine', 'Triptan',
    'Sumatriptan', 'Imitrex',
    'Rizatriptan', 'Maxalt',
    'Naratriptan', 'Amerge',
    'Eletriptan', 'Relpax',
    'Zolmitriptan', 'Zomig',
    'Frovatriptan', 'Frova',
    'Almotriptan', 'Axert',
    'Erenumab', 'Aimovig',
    'Galcanezumab', 'Emgality',
    'Fremanezumab', 'Ajovy',
    'Eptinezumab', 'Vyepti',
    'Ubrogepant', 'Ubrelvy',
    'Rimegepant', 'Nurtec',
    'Atogepant', 'Qulipta',
]

In [None]:
# Row labels of the posts whose text contains none of the keywords
# (case-insensitive substring match); these rows are dropped afterwards.
index_to_delete = [
  i
  for i in range(len(df))
  if not any(word.lower() in df['text'][i].lower() for word in keywords)
]

In [None]:
len(index_to_delete)

1476

In [None]:
# Remove every non-matching row in one call. Dropping a single label per loop
# iteration rebuilt the whole dataframe once for each deleted row.
df = df.drop(index=index_to_delete)

In [None]:
df.shape

(1509, 2)

In [None]:
2985-1476 # rows before filtering - rows without any keyword (len(index_to_delete)) = rows kept; should equal df.shape[0]

1509

# Splitting dataset into Validation & Test

In this section, we validate a model that was trained on Twitter data. The test dataset is manually annotated by medical practitioners.

In [None]:
# Rows were dropped by label above, leaving gaps in the index; renumber 0..n-1 before splitting.
df = df.reset_index(drop=True)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# 80/20 split into a validation part and a test part.
# random_state pins the shuffle so the same rows land in each part on every run;
# without it, re-running the notebook produces a test set that no longer matches
# the file that was exported for manual annotation.
# NOTE(review): files exported from an earlier, unseeded run will not match this seeded split.
Validation , Test = train_test_split(df,test_size=0.2,random_state=42)

In [None]:
Validation.shape

(1207, 10)

In [None]:
Test.shape

(302, 2)

# Reset Validation & Test Dataset indices. Save the 'Validation' & 'Test' in xlsx

In [None]:
Validation.head()

In [None]:
Test.head()

In [None]:
# The split keeps each row's original index label; renumber the validation rows 0..n-1.
Validation = Validation.reset_index(drop=True)

In [None]:
# The split keeps each row's original index label; renumber the test rows 0..n-1.
Test = Test.reset_index(drop=True)

In [None]:
# NOTE(review): this is a relative path, so the file is written to the notebook's current
# working directory and not to the mounted Drive folder used for the validation file.
# Confirm that is intended.
Test.to_excel(r'Reddit_test_dataset.xlsx', index = False) #Exporting the test dataset for medical practitioners to manually annotate it

In [None]:
# Write the validation split to the mounted Google Drive; index=False leaves the row index out of the sheet.
Validation.to_excel(r'/content/drive/MyDrive/Reddit_validation_dataset.xlsx', index = False)

# Few Amendments
In the previous sections, I removed the posts that did not contain any of the keywords. However, our goal is to see how the pre-trained model classifies a real-world dataset, so those posts should not be removed.

The data has already been split into validation and test sets, and the test dataset has been shared with medical practitioners for manual annotation. So I am going to adopt a workaround: I will split only the remaining (2985-1509) posts, i.e. the ones that were removed by the keyword filter, into their own validation and test sets. This way, the previously exported datasets remain untouched.



In [None]:
2985-1509 # all de-duplicated posts - keyword-matched posts = posts removed by the keyword filter

1476

In [None]:
# I ran only few cells from top to the point where df does not contain duplicates & NaN
df.shape

(2985, 2)

In [None]:
# Let's store it in a new dataframe.
# .copy() makes df_all_data an independent object. Plain assignment would only give
# the same dataframe a second name, so any later in-place change made through df
# would also show up in df_all_data.
df_all_data = df.copy()

In [None]:
df_all_data.head()

In [None]:
# df here is the keywords based filtering.
df.shape

(1509, 2)

In [None]:
# NOTE(review): df is expected to be the 1509-row keyword-filtered frame here, which means
# the keyword filtering cells were re-run after df_all_data was captured. This step depends
# on that manual execution order and is not reproduced by a plain top-to-bottom run.
df_filtered = df

In [None]:
df_filtered = df_filtered.reset_index(drop=True)

In [None]:
df_filtered.head()

In [None]:
# Anti-join on whole rows: turn each row into a tuple key and keep the rows of
# df_all_data whose key does not occur in df_filtered.
filtered_keys = pd.MultiIndex.from_frame(df_filtered)
already_used = pd.MultiIndex.from_frame(df_all_data).isin(filtered_keys)
df_difference = df_all_data[~already_used]

In [None]:
# This length should be 1476 (=2985-1509)
len(df_difference)

1476

In [None]:
df_difference

In [None]:
# Boolean-mask selection keeps the original row labels; renumber 0..n-1.
df_difference = df_difference.reset_index(drop=True)

# Splitting df_difference into two datasets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# 80/20 split of the posts that were removed by the keyword filter.
# random_state pins the shuffle so the exported files can be regenerated with exactly
# the same rows; without it every run yields a different split.
# NOTE(review): files exported from an earlier, unseeded run will not match this seeded split.
Validation_diff , Test_diff = train_test_split(df_difference,test_size=0.2,random_state=42)

In [None]:
Validation_diff.shape

(1180, 2)

In [None]:
Test_diff.shape

(296, 2)

In [None]:
302+296 # rows in Test + rows in Test_diff = total rows set aside for testing

598

In [None]:
1180+296 # rows in Validation_diff + rows in Test_diff; should equal len(df_difference)

1476

In [None]:
# Renumber both splits 0..n-1, discarding the index labels carried over from df_difference.
Validation_diff, Test_diff = (
    part.reset_index(drop=True) for part in (Validation_diff, Test_diff)
)

In [None]:
# Save both splits to the '2985 Dataset' folder on the mounted Drive for future reference.
# index=False leaves the row index out of the sheets.
Test_diff.to_excel(r'/content/drive/MyDrive/Reddit Migraine Dataset/2985 Dataset/Reddit_test_diff_dataset.xlsx', index = False) #Exporting the test dataset for medical practitioners to manually annotate it
Validation_diff.to_excel(r'/content/drive/MyDrive/Reddit Migraine Dataset/2985 Dataset/Reddit_validation_diff_dataset.xlsx', index = False)