# LoadData.py
# Forked from vinneysong/Chinese-Poem-Generate-Based-on-GPT2
import json
from collections import Counter

import pandas as pd
from tqdm import tqdm
# Legacy (disabled): convert poetryTang.txt, one 'title::author::content'
# record per line, into a tab-separated CSV. Kept for reference only.
#
# with open('dataset/poetryTang/poetryTang.txt', 'r', encoding='utf-8') as handler:
#     lines = handler.read().split('\n')
#
# data = list()
# for line in tqdm(lines):
#     sp = line.split('::')
#     if len(sp) != 3:
#         print("Error: ", sp)
#         continue
#     data.append(sp)
# train = pd.DataFrame(data)
# train.columns = ['title', 'author', 'content']
# train['keywords']=['']*len(train)
# train['dynasty']=['Tang']*len(train)
# train.to_csv('dataset/poetryTang/poetryTang.csv',columns=['title', 'author', 'content','keywords','dynasty'],
#              sep='\t',
#              index=False)
# Load the CCPC training file (one JSON object per line) and export it as a
# tab-separated CSV.
json_list = []
with open('./dataset/CCPC/ccpc_train_v1.0.json', 'r', encoding='utf-8') as fp:
    # Iterate the file lazily instead of calling readlines(), and skip
    # whitespace-only lines (e.g. a trailing newline at end of file), which
    # would otherwise make json.loads raise JSONDecodeError.
    for line in fp:
        if line.strip():
            json_list.append(json.loads(line))

# Each row: replace the '|' separators in the content with commas and append
# a closing '。'.
# NOTE(review): the separator becomes an ASCII ',' while the terminator is a
# full-width '。' — confirm this mix is intended.
data = [
    [
        d['dynasty'],
        d['author'],
        d['content'].replace('|', ',') + '。',
        d['title'],
        d['keywords'],
    ]
    for d in json_list
]
train = pd.DataFrame(data, columns=['dynasty', 'author', 'content', 'title', 'keywords'])

# Written with the same column order as the (disabled) poetryTang export.
train.to_csv(
    'dataset/CCPC/CCPC.csv',
    columns=['title', 'author', 'content', 'keywords', 'dynasty'],
    sep='\t',
    index=False,
)
import matplotlib.pyplot as plt
# Plot how many poems there are for each content length (len() of the
# content string) as a bar chart, save it to CCPC.png, then display it.
texts = train['content'].tolist()

# Counter replaces the hand-rolled tally, which also shadowed the builtin
# `dict` for the rest of the module.
length_counts = Counter(len(text) for text in texts)

# Sort by length so the bars are laid out in increasing length order.
d = sorted(length_counts.items())
x = [length for length, _ in d]
y = [count for _, count in d]

plt.bar(x, y)
# Save before show(): the figure may be cleared once the window is closed.
plt.savefig('CCPC.png')
plt.show()