made scraping script parallel, updated readme and gitignore

workmanjack · Nov 7, 2018 · afd58be · afd58be
1 parent 18da97c
commit afd58be
Show file tree

Hide file tree

Showing 4 changed files with 18 additions and 16 deletions.
diff --git a/.gitignore b/.gitignore
@@ -5,6 +5,6 @@ data/*
 # certain data files should be shared
 !data/mxm_mappings.csv
 !data/no_lyrics.csv
-!data/lyrics
+# We can't upload the data to GitHub
 # to make sure that everyone has a log dir
-!logs/.gitkeep
+!logs/.gitkeep
diff --git a/README.md b/README.md
@@ -62,6 +62,7 @@ Then, run through the following commands:
 - Windows: `.venv_w266_project\Scripts\activate.bat`
 - Linux: `source .venv_w266_project/bin/activate`
 - `pip install -r requirements.txt` - this will install all required packages and might take several minutes
+- `python scrape_lyrics.py -t a & python scrape_lyrics.py -t b` to run in parallel. Use `fg` to switch between processes so you can quit with ^C
 
 ## Downloading Data
 

diff --git a/batch_scraping.sh b/batch_scraping.sh
@@ -6,9 +6,9 @@
 #       'y', 'Ä', 'l', 'k', 'n', 'f', 'c', 'h', 'e', 'x', 'r', '(', 'b',
 #       '.', '-', 'ü', 'Â', 'g', '+', 'Ó', 'v', 'µ', 'Í', '$', '!', 'Ö',
 #       '0', 'u', 'w', 'o', '#', ':', 'Æ', '}', '\x1c'], dtype=object)
-# python scrape_lyrics.py -a a
-# python scrape_lyrics.py -a b
-# python scrape_lyrics.py -a c
+python3 scrape_lyrics.py -a a
+python3 scrape_lyrics.py -a b
+python3 scrape_lyrics.py -a c
 # python scrape_lyrics.py -a d
 # python scrape_lyrics.py -a e
 # python scrape_lyrics.py -a g
@@ -17,13 +17,13 @@
 # python scrape_lyrics.py -a j
 # python scrape_lyrics.py -a k
 # python scrape_lyrics.py -a l
-python scrape_lyrics.py -a m
-python scrape_lyrics.py -a n
-python scrape_lyrics.py -a o
-python scrape_lyrics.py -a p
-python scrape_lyrics.py -a q
-python scrape_lyrics.py -a r
-python scrape_lyrics.py -a s
+# python scrape_lyrics.py -a m
+# python scrape_lyrics.py -a n
+# python scrape_lyrics.py -a o
+# python scrape_lyrics.py -a p
+# python scrape_lyrics.py -a q
+# python scrape_lyrics.py -a r
+# python scrape_lyrics.py -a s
 # python scrape_lyrics.py -a t
 # python scrape_lyrics.py -a u
 # python scrape_lyrics.py -a v
@@ -54,4 +54,4 @@ python scrape_lyrics.py -a s
 # python scrape_lyrics.py -a \!
 # python scrape_lyrics.py -a \#
 # python scrape_lyrics.py -a \:
-# python scrape_lyrics.py -a \}
+# python scrape_lyrics.py -a \}
diff --git a/scrape_lyrics.py b/scrape_lyrics.py
@@ -153,6 +153,7 @@ def scrape_lyrics(artist_name_starts_with):
             csvwriter.writerow(CSV_HEADER)
     df_no_lyrics = pd.read_csv(CSV_NO_LYRICS, encoding='utf-8')
     df_no_lyrics = df_no_lyrics.sort_values('msd_artist')
+    df_no_lyrics_update = pd.DataFrame(columns = CSV_HEADER)
 
     song_index = 0
     songs_skipped = 0
@@ -193,8 +194,8 @@ def scrape_lyrics(artist_name_starts_with):
                     if not song:
                         # no luck... on to the next one
                         # https://stackoverflow.com/questions/24284342/insert-a-row-to-pandas-dataframe/24287210
-                        df_no_lyrics.loc[-1] = row
-                        df_no_lyrics.index += 1
+                        df_no_lyrics_update.loc[-1] = row
+                        df_no_lyrics_update.index += 1
                         logger.debug('{0}: No luck (artist={1}, title={2}). Saved to no lyrics csv.'.format(song_index, row['msd_artist'], row['mxm_artist']))
                         continue
 
@@ -214,7 +215,7 @@ def scrape_lyrics(artist_name_starts_with):
         logger.info(kbi)
 
     logger.info('saving no lyrics csv...')
-    df_no_lyrics.to_csv(CSV_NO_LYRICS, encoding='utf-8', index=False)
+    df_no_lyrics_update.to_csv(CSV_NO_LYRICS, encoding='utf-8', index=False, mode='a', header=False)
     logger.info('done.')
 
     end = time.time()