-
Notifications
You must be signed in to change notification settings - Fork 1
/
create_dataset.py
140 lines (128 loc) · 5.75 KB
/
create_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# -*- coding: utf-8 -*-
import re
from time import sleep
import requests
from bs4 import BeautifulSoup
def is_english(s):
try:
s.encode('ascii')
except UnicodeEncodeError:
return False
else:
return True
def is_float(value):
try:
float(value)
return True
except ValueError:
return False
def flat_to_sharp(chord): # converts flat chords to their equivalent sharp chord
chord = flat_chords[chord[:2]].upper() + chord[2:]
return chord
def get_cookie():
with open('cookie.txt') as file:
for line in file:
return line
flat_chords = dict(Ab='G#', Bb='A#', Db='C#', Eb='D#', Gb='F#') # dictionary to convert flat chords to sharp chords
artist_url = 'https://kithara.to/ci/xatfgvracketa' # artist url
base_url = "https://kithara.to/ssbd.php?id=" # base url used by kithara.to to get lyrics and chords
artist_session = requests.Session() # starting a new session
cookies = dict(memberSession=get_cookie()) # login cookie, allows for unlimited pings
artist_data = artist_session.get(artist_url, cookies=cookies) # getting HTML artist page
artist_text = artist_data.text # scraping text
artist_soup = BeautifulSoup(artist_text) # initialising BS4
tables = artist_soup.find_all("table", class_="thist2col") # songs are stored on a table
titles = dict()
song_table = tables[1] # the first table is just the top songs, we want all of them
for a_tag in song_table.find_all('a'):
title_tag = a_tag['title']
link = a_tag['href']
title_elements = re.findall('\[[^\]]*\]|\([^\)]*\)|\"[^\"]*\"|\S+', title_tag) # splitting title
specifications = re.search('\(([^)]+)', title_elements.pop()).group(1).split(',') # song specs from parentheses
title = ' '.join(title_elements).upper() # capitalising every title
title = re.sub(r" ?\([^)]+\)", "", title)
print("Specs: " + str(specifications) + " Title: " + title)
current_rating = specifications[-1]
if is_float(current_rating):
current_rating = float(current_rating)
else:
current_rating = 0
if title in titles:
if titles[title]['rating'] < current_rating:
titles[title]['rating'] = current_rating
titles[title]['link'] = link
print("updated")
else:
titles[title] = {}
titles[title]['rating'] = current_rating
titles[title]['link'] = link
song_ids = [titles[song]['link'].split("/")[2] for song in titles]
print(len(song_ids))
chords = dict() # distinct chords will be stored here
edges = [] # edges of the graph aka chord transitions will be stored here
id = 0 # id variable for chords
previous_chord = None
counter = 1
for song_id in song_ids: # going through every single song
counter += 1
session = requests.Session() # starting a new session
data = session.get(base_url + song_id, cookies=cookies) # getting song page data with a login cookie
text = data.text # scraping text
soup = BeautifulSoup(text, 'lxml') # initialising BS4
divs = soup.find_all("div", class_="ch") # ch class is where chords are stored
for div in divs:
divtext = div.text
line = divtext.split() # splitting chords
for chord in line:
chord = chord.capitalize()
if chord in chords:
# TODO
try:
if previous_chord != chord: # checking if a chord transition took place
edges.append([chords[previous_chord], chords[chord], 'Directed']) # appending new edge
previous_chord = chord
pass
except Exception as e:
print(e)
else:
try:
if chord[:1].isalpha() and is_english(chord): # checking if it is indeed a chord
if '(' in chord:
chord = chord.replace('(', '-').replace(')', '')
if '/' in chord:
chord = chord.replace('/', '-')
if '-' in chord:
clean_chords = chord.split('-')
print("New dirty chord " + chord)
for clean_chord in clean_chords:
clean_chord = clean_chord.title()
if 'b' in clean_chord:
clean_chord = flat_to_sharp(clean_chord)
if previous_chord != chord:
edges.append(
[chords[previous_chord], chords[clean_chord], 'Directed']) # appending new edge
previous_chord = clean_chord
continue
if 'b' in chord: # converting flat chord to sharp chord
chord = flat_to_sharp(chord)
if chord not in chords:
chords[chord] = id
id += 1
if previous_chord is not None: # useful when examining first chord
edges.append([chords[previous_chord], chords[chord], 'Directed'])
previous_chord = chord
except Exception as e:
print(e)
sleep(0.5)
print(chords)
nodes_csv_columns = 'id, label'
edges_csv_columns = 'source, target, type'
with open('nodes.csv', 'w') as csvfile:
csvfile.write(nodes_csv_columns + '\n')
for chord in chords:
csvfile.write(str(chords[chord]) + "," + chord + '\n')
with open('edges.csv', 'w') as csvfile:
csvfile.write(edges_csv_columns)
for edge in edges:
csvfile.write('\n')
csvfile.write(str(edge[0]) + ',' + str(edge[1]) + ',' + str(edge[2]))