In [1]:
import json
import pandas as pd

In [2]:
# Load the raw records; later cells read the 'genre', 'title' and 'text' columns.
df = pd.read_json('all_raw.json')

In [3]:
# Seed the working column with the original genre; later cells override selected rows.
df = df.assign(better_genre=df['genre'])

In [4]:
# Side-by-side view of the original and working genre for each title.
df.loc[:, ['genre', 'better_genre', 'title']]

Unnamed: 0,genre,better_genre,title
0,abstract,abstract,An Abstract of A Treatise of Human Nature
1,dialogues,dialogues,Pamphilus to Hermippus
2,dialogues,dialogues,Part 1
3,dialogues,dialogues,Part 2
4,dialogues,dialogues,Part 3
...,...,...,...
272,treatise,treatise,SECTION II: OF GREATNESS OF MIND
273,treatise,treatise,SECTION III: OF GOODNESS AND BENEVOLENCE
274,treatise,treatise,SECTION IV: OF NATURAL ABILITIES
275,treatise,treatise,SECTION V: SOME FURTHER REFLECTIONS CONCERNING...


### Update genres according to the [sheets](https://docs.google.com/spreadsheets/d/1X5IdzFhdN8_d7km_vE9wQva9csbVFbyqjOt-trZPj7U/edit?gid=1489902570#gid=1489902570)

In [5]:
# Collapse fine-grained source genres into their coarser target genre.
genre_overrides = {
    'enquiry PoM': 'enquiry',
    'enquiry HU': 'enquiry',
    'natural history': 'dissertation',
}
for old_genre, new_genre in genre_overrides.items():
    df.loc[df['genre'] == old_genre, 'better_genre'] = new_genre

In [6]:
# Per-title genre overrides, keyed by target genre. Titles must match the
# 'title' column exactly.
# NOTE(review): Essays 6 and 12-14 are not listed here; presumably intentional
# per the spreadsheet -- confirm.
title_overrides = {
    'dissertation': [
        'Essay 22: Of Tragedy',
        'Essay 23: Of the Standard of Taste',
    ],
    'political discourses': [
        'Essay 1: Of Commerce',
        'Essay 2: Of Refinement in the Arts',
        'Essay 3: Of Money',
        'Essay 4: Of Interest',
        'Essay 5: Of the Balance of Trade',
        'Essay 7: Of the Balance of Power',
        'Essay 8: Of Taxes',
        'Essay 9: Of Public Credit',
        'Essay 10: Of Some Remarkable Customs',
        'Essay 11: Of the Populousness of Ancient Nations',
        'Essay 15: Of the Protestant Succession',
        'Essay 16: Idea of a Perfect Commonwealth',
    ],
}

known_titles = set(df['title'])
for new_genre, titles in title_overrides.items():
    df.loc[df['title'].isin(titles), 'better_genre'] = new_genre

    # A mistyped title would otherwise be a silent no-op; report it instead of
    # failing so the rest of the notebook still runs.
    unmatched_titles = [title for title in titles if title not in known_titles]
    if unmatched_titles:
        print(f"WARNING: no rows matched for {new_genre!r}: {unmatched_titles}")

In [7]:
# Drop the original genre column so the corrected one can take its name in the
# next cell. The guard keeps this cell safe to re-run: once 'better_genre' has
# been renamed to 'genre', an unguarded drop would delete the corrected column.
if 'better_genre' in df.columns:
    df = df.drop(columns=['genre'])

In [8]:
# The corrected column takes over the 'genre' name.
df = df.rename({'better_genre': 'genre'}, axis='columns')

In [9]:
# Distinct genre names remaining after the updates.
set(df['genre'].tolist())

{'abstract',
 'dialogues',
 'dissertation',
 'enquiry',
 'essay',
 'history',
 'letter',
 'political discourses',
 'treatise'}

### Add integer labels for genres

In [10]:
# Genre names in label order: a genre's integer label is its position in this
# list. Append new genres at the end so existing label ids stay stable.
labels_list = ['dialogues', 'dissertation', 'enquiry', 'essay',
               'history', 'political discourses', 'treatise', 'letter',
               'abstract']

# Genre name -> integer label, derived from labels_list so the two can never
# drift apart.
labels_dict = {genre_name: label_id
               for label_id, genre_name in enumerate(labels_list)}

def apply_label(value):
    """Return the integer label registered for the genre name `value`.

    The lookup goes through the module-level `labels_dict`; a genre that is
    not a key of that dict raises KeyError.
    """
    label_id = labels_dict[value]
    return label_id

# Attach the integer label of each row's genre.
df['labels'] = df['genre'].map(apply_label)

In [11]:
# Distinct integer labels actually present in the data.
set(df['labels'].tolist())

{0, 1, 2, 3, 4, 5, 6, 7, 8}

In [12]:
# Persist the relabelled frame as a JSON array with one object per row.
df.to_json(path_or_buf='all_updGenres.json', orient='records')

### Split each text into one record per paragraph

In [13]:
# Read the records back as plain Python lists/dicts, which are easier to
# reshape paragraph-by-paragraph than the DataFrame.
with open('all_updGenres.json') as handle:
    data = json.loads(handle.read())

In [14]:
# One output record per element of a document's 'text'; each record repeats
# the title, genre and label of the document it came from.
jdict = [
    {'title': record['title'],
     'genre': record['genre'],
     'text': paragraph,
     'labels': record['labels']}
    for record in data
    for paragraph in record['text']
]

In [15]:
# Total number of paragraph-level records produced by the flattening step.
len(jdict)

8576

In [16]:
# Spot-check one arbitrary record; index 2000 requires at least 2001 records.
jdict[2000]

{'title': 'Essay 16: Idea of a Perfect Commonwealth',
 'genre': 'political discourses',
 'text': 'In the commonwealth, no representative, magistrate, or senator, as such, has any salary. The protector, secretaries, councils, and ambassadors, have salaries.',
 'labels': 5}

In [17]:
# Write the paragraph-level records to disk, pretty-printed with a 3-space indent.
with open('all.json', mode='w') as out_file:
    json.dump(jdict, out_file, indent=3)