Skip to content

Commit

Permalink
made scraping script parallel, updated readme and gitignore
Browse files Browse the repository at this point in the history
  • Loading branch information
Cyprian Gascoigne committed Nov 7, 2018
1 parent 18da97c commit afd58be
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 16 deletions.
4 changes: 2 additions & 2 deletions .gitignore
Expand Up @@ -5,6 +5,6 @@ data/*
# certain data files should be shared
!data/mxm_mappings.csv
!data/no_lyrics.csv
!data/lyrics
# We can't upload the data to github
# to make sure that everyone has a log dir
!logs/.gitkeep
!logs/.gitkeep
1 change: 1 addition & 0 deletions README.md
Expand Up @@ -62,6 +62,7 @@ Then, run through the following commands:
- Windows: `.venv_w266_project\Scripts\activate.bat`
- Linux: `source .venv_w266_project/bin/activate`
- `pip install -r requirements.txt` - this will install all required packages and might take several minutes
- `python scrape_lyrics.py -t a & python scrape_lyrics.py -t b` to run in parallel. Use `fg` to switch between processes so you can quit them with ^C

## Downloading Data

Expand Down
22 changes: 11 additions & 11 deletions batch_scraping.sh
Expand Up @@ -6,9 +6,9 @@
# 'y', 'Ä', 'l', 'k', 'n', 'f', 'c', 'h', 'e', 'x', 'r', '(', 'b',
# '.', '-', 'ü', 'Â', 'g', '+', 'Ó', 'v', 'µ', 'Í', '$', '!', 'Ö',
# '0', 'u', 'w', 'o', '#', ':', 'Æ', '}', '\x1c'], dtype=object)
# python scrape_lyrics.py -a a
# python scrape_lyrics.py -a b
# python scrape_lyrics.py -a c
python3 scrape_lyrics.py -a a
python3 scrape_lyrics.py -a b
python3 scrape_lyrics.py -a c
# python scrape_lyrics.py -a d
# python scrape_lyrics.py -a e
# python scrape_lyrics.py -a g
Expand All @@ -17,13 +17,13 @@
# python scrape_lyrics.py -a j
# python scrape_lyrics.py -a k
# python scrape_lyrics.py -a l
python scrape_lyrics.py -a m
python scrape_lyrics.py -a n
python scrape_lyrics.py -a o
python scrape_lyrics.py -a p
python scrape_lyrics.py -a q
python scrape_lyrics.py -a r
python scrape_lyrics.py -a s
# python scrape_lyrics.py -a m
# python scrape_lyrics.py -a n
# python scrape_lyrics.py -a o
# python scrape_lyrics.py -a p
# python scrape_lyrics.py -a q
# python scrape_lyrics.py -a r
# python scrape_lyrics.py -a s
# python scrape_lyrics.py -a t
# python scrape_lyrics.py -a u
# python scrape_lyrics.py -a v
Expand Down Expand Up @@ -54,4 +54,4 @@ python scrape_lyrics.py -a s
# python scrape_lyrics.py -a \!
# python scrape_lyrics.py -a \#
# python scrape_lyrics.py -a \:
# python scrape_lyrics.py -a \}
# python scrape_lyrics.py -a \}
7 changes: 4 additions & 3 deletions scrape_lyrics.py
Expand Up @@ -153,6 +153,7 @@ def scrape_lyrics(artist_name_starts_with):
csvwriter.writerow(CSV_HEADER)
df_no_lyrics = pd.read_csv(CSV_NO_LYRICS, encoding='utf-8')
df_no_lyrics = df_no_lyrics.sort_values('msd_artist')
df_no_lyrics_update = pd.DataFrame(columns = CSV_HEADER)

song_index = 0
songs_skipped = 0
Expand Down Expand Up @@ -193,8 +194,8 @@ def scrape_lyrics(artist_name_starts_with):
if not song:
# no luck... on to the next one
# https://stackoverflow.com/questions/24284342/insert-a-row-to-pandas-dataframe/24287210
df_no_lyrics.loc[-1] = row
df_no_lyrics.index += 1
df_no_lyrics_update.loc[-1] = row
df_no_lyrics_update.index += 1
logger.debug('{0}: No luck (artist={1}, title={2}). Saved to no lyrics csv.'.format(song_index, row['msd_artist'], row['mxm_artist']))
continue

Expand All @@ -214,7 +215,7 @@ def scrape_lyrics(artist_name_starts_with):
logger.info(kbi)

logger.info('saving no lyrics csv...')
df_no_lyrics.to_csv(CSV_NO_LYRICS, encoding='utf-8', index=False)
df_no_lyrics_update.to_csv(CSV_NO_LYRICS, encoding='utf-8', index=False, mode='a', header=False)
logger.info('done.')

end = time.time()
Expand Down

0 comments on commit afd58be

Please sign in to comment.