In [71]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [72]:
df = pd.read_csv("covid_dataset.csv")

In [73]:
df.columns

Index(['Unnamed: 0', 'id', 'text', 'politics', 'economy', 'foreign', 'culture',
       'situation', 'measures', 'racism', 'overall'],
      dtype='object')

In [74]:
df = df.drop(['Unnamed: 0'], axis=1)

In [75]:
df.head()

Unnamed: 0,id,text,politics,economy,foreign,culture,situation,measures,racism,overall
0,1235951163427688448,"""Every time we intubate a patient, we have no ...",0,0,0,0,0,3,0,3
1,1220784346983911424,"""In 2003, we built Xiaotangshan Hospital in 7 ...",0,0,0,0,0,3,0,3
2,1224530700965015552,"""Our nations should never allow China intrinsi...",0,0,0,0,0,0,0,1
3,1236593587908947968,"""He's not dead. His hands and feet were still ...",0,0,0,0,1,0,0,1
4,1230198133893799936,"""....disregarding the facts and becoming a par...",0,0,0,0,0,0,0,2


In [76]:
aspects = ['politics', 'economy', 'foreign', 'culture','situation', 'measures', 'racism', 'overall']
attribute_dict = {0:"unrelated", 1:"negative", 2:"neutral", 3:"positive"}

In [77]:
df_out = pd.DataFrame(columns=['id', 'attributes', 'aspect', 'text'])

In [78]:
df_out

Unnamed: 0,id,attributes,aspect,text


In [79]:
for row in df.iterrows(): 
    print(row[1])
    break

id                                         1235951163427688448
text         "Every time we intubate a patient, we have no ...
politics                                                     0
economy                                                      0
foreign                                                      0
culture                                                      0
situation                                                    0
measures                                                     3
racism                                                       0
overall                                                      3
Name: 0, dtype: object


In [80]:
aspects = ['politics', 'economy', 'foreign', 'culture','situation', 'measures', 'racism', 'overall']
aspects_idx = [2,3,4,5,6,7,8,9]
aspect_idx_dict = dict(zip(aspects,aspects_idx))
aspect_idx_dict


{'politics': 2,
 'economy': 3,
 'foreign': 4,
 'culture': 5,
 'situation': 6,
 'measures': 7,
 'racism': 8,
 'overall': 9}

In [81]:
for row in df.iterrows(): 
    for aspect in aspects:
        tmp_dict = {}
        tmp_dict['id'] = row[1][0]
        tmp_dict['attributes'] = attribute_dict[row[1][aspect_idx_dict[aspect]]]
        tmp_dict['aspect'] = aspect 
        tmp_dict['text'] = row[1][1]
        print(tmp_dict)
    break

{'id': 1235951163427688448, 'attributes': 'unrelated', 'aspect': 'politics', 'text': '"Every time we intubate a patient, we have no option of failure."\n\nMeet the intubation team in Wuhan who have a high risk of being infected but still spare no efforts to save patients\' lives from #COVID19 https://t.co/J8yRci7Pxw'}
{'id': 1235951163427688448, 'attributes': 'unrelated', 'aspect': 'economy', 'text': '"Every time we intubate a patient, we have no option of failure."\n\nMeet the intubation team in Wuhan who have a high risk of being infected but still spare no efforts to save patients\' lives from #COVID19 https://t.co/J8yRci7Pxw'}
{'id': 1235951163427688448, 'attributes': 'unrelated', 'aspect': 'foreign', 'text': '"Every time we intubate a patient, we have no option of failure."\n\nMeet the intubation team in Wuhan who have a high risk of being infected but still spare no efforts to save patients\' lives from #COVID19 https://t.co/J8yRci7Pxw'}
{'id': 1235951163427688448, 'attributes': 

In [82]:
train, dev_test = train_test_split(df, test_size=0.1)
dev, ttest = train_test_split(dev_test, test_size=0.5)
train.shape, dev.shape, test.shape

((4770, 10), (265, 10), (266, 10))

In [83]:
train_list = []

for row in train.iterrows(): 
    for aspect in aspects:
        tmp_dict = {}
        tmp_dict['id'] = row[1][0]
        tmp_dict['attributes'] = attribute_dict[row[1][aspect_idx_dict[aspect]]]
        tmp_dict['aspect'] = aspect 
        tmp_dict['text'] = row[1][1].replace('\t',' ').replace('\n',' ').replace('\r',' ')
        train_list.append(tmp_dict)
train_out = pd.DataFrame(train_list)[['id','attributes','aspect','text']]
    

In [84]:
train_out.shape

(38160, 4)

In [85]:
dev_list = []

for row in test.iterrows(): 
    for aspect in aspects:
        tmp_dict = {}
        tmp_dict['id'] = row[1][0]
        tmp_dict['attributes'] = attribute_dict[row[1][aspect_idx_dict[aspect]]]
        tmp_dict['aspect'] = aspect 
        tmp_dict['text'] = row[1][1].replace('\t',' ').replace('\n',' ').replace('\r',' ')
        dev_list.append(tmp_dict)
dev_out = pd.DataFrame(dev_list)[['id','attributes','aspect','text']]
dev_out.shape

(2128, 4)

In [86]:
test_list = []

for row in test.iterrows(): 
    for aspect in aspects:
        tmp_dict = {}
        tmp_dict['id'] = row[1][0]
        tmp_dict['attributes'] = attribute_dict[row[1][aspect_idx_dict[aspect]]]
        tmp_dict['aspect'] = aspect 
        tmp_dict['text'] = row[1][1].replace('\t',' ').replace('\n',' ').replace('\r',' ')
        test_list.append(tmp_dict)
test_out = pd.DataFrame(test_list)[['id','attributes','aspect','text']]
test_out.shape

(2128, 4)

In [69]:
train_out.to_csv("./data/train_NLI_M.csv", sep='\t', index=False, header=False)
dev_out.to_csv("./data/test_NLI_M.csv", sep='\t', index=False, header=False)
test_out.to_csv("./data/test_NLI_M.csv", sep='\t', index=False, header=False)

In [70]:
train_out.loc[28097]['text']

'The sky in Wuhan on Friday is steel-grey, and it’s melancholy day after the death of a doctor who tried to warn of the virus that he died from. All the updates are on our live briefing, and I’ll be asking residents about the news. https://t.co/fkfIe3OoXH https://t.co/Yn9Ku0nm4H'