In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import wikipediaapi

import requests, re
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import unidecode

from tqdm import tqdm_notebook


In [2]:
wiki = wikipediaapi.Wikipedia('en')

In [3]:
freshmen_1118 = []

def get_debut_albums():
    
    for i in tqdm_notebook(range(1, 8+1)):
        result = wiki.page("Category:201"+str(i)+" debut albums")
        freshmen_1118.extend(list(result.categorymembers))
    
    print(len(freshmen_1118))

In [4]:
get_debut_albums()


2011


In [5]:
def get_freshmen_data(list):
    
    df = pd.DataFrame(columns=['artist', 'album', 'genre', 'single_count', 'release_date'])
    
    for albums in tqdm_notebook(list):
        url = "https://en.wikipedia.org/wiki/" + str(albums)
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        tmp_tag = soup.find(id='content')
        
        
        try:
            artist = tmp_tag.find(class_="contributor").get_text()
        except AttributeError:
            artist = "N/A"
        
        
        try:
            album = tmp_tag.find(class_="summary").get_text()
        except AttributeError:
            album = "N/A"
        
        genre_ls = []
        try:
            for i in range(len(tmp_tag.select('.category a[href*="/wiki/"]'))):
                genre = tmp_tag.select('.category a[href*="/wiki/"]')[i].get_text()
                genre_ls.append(genre)
        except AttributeError:
            genre_ls = []
        genre_ls = ", ".join(genre_ls)
        
        try:
            single_count = len(tmp_tag.select("tr td div ol li"))
        except AttributeError:
            single_count = 0
            
            
        try:
            release_date = tmp_tag.select('.published')[0].get_text()
        except Exception as e:
            release_date = "unknown"

        
        
        data = {"artist" : artist,
            "album" : album,
            "genre" : genre_ls,
            "single_count" : single_count,
            "release_date" : release_date,
               }

        df.loc[len(df)] = data
    
    return df

In [8]:
ls_debut = get_freshmen_data(freshmen_1118)




In [9]:
print(ls_debut.shape)
ls_debut.tail()

(2011, 5)


Unnamed: 0,artist,album,genre,single_count,release_date
2006,EDEN,vertigo,,3,19 January 2018 (2018-01-19)
2007,Nipsey Hussle,Victory Lap,West Coast hip hop,2,"February 16, 2018"
2008,Rich the Kid,The World Is Yours,,3,"March 30, 2018"
2009,Myles Kennedy,Year of the Tiger,,0,"March 9, 2018 (2018-03-09)"
2010,Fickle Friends,You Are Someone Else,,6,16 March 2018[1]


### Examine Scraping Errors

In [10]:
ls_debut = ls_debut[ls_debut['album'].isin(['N/A']) == False]

In [11]:
# Row 1735 happens to be K-pop group "Nine Muses"
# Not relevant to the data purpose, thus drop

ls_debut[ls_debut['artist'].isin(['N/A'])]

Unnamed: 0,artist,album,genre,single_count,release_date
319,,Talk About Body,"Electropop, Political music",3,"February 1, 2011 (February 1, 2011)"
1736,,"""Muses Diary""",,0,unknown


In [12]:
ls_debut.drop([1735], inplace=True)

In [13]:
# Fill in artist info for row 1735 (https://en.wikipedia.org/wiki/Talk_About_Body)

ls_debut[ls_debut['artist'].isin(['N/A'])]

Unnamed: 0,artist,album,genre,single_count,release_date
319,,Talk About Body,"Electropop, Political music",3,"February 1, 2011 (February 1, 2011)"
1736,,"""Muses Diary""",,0,unknown


In [14]:
ls_debut['artist'][319] = 'MEN'

In [34]:
ls_debut['artist'][1736]

'N/A'

### replace accent characters and commas

In [15]:
def decodeuni(str):
    return unidecode.unidecode(str)

In [16]:
ls_debut['artist'] = ls_debut['artist'].apply(decodeuni)
ls_debut['album'] = ls_debut['album'].apply(decodeuni)
ls_debut['artist'] = ls_debut['artist'].apply(lambda x : x.replace(",", ""))
ls_debut['album'] = ls_debut['album'].apply(lambda x : x.replace(",", ""))

### Drop duplicate entries

In [63]:
# Drop duplicate entries. 
# Keep the earlier debut album, drop the latter ones.

ls_debut.drop_duplicates(subset='artist', keep='first', inplace=True)

In [52]:
ls_debut.drop(index=1693, inplace=True)

In [54]:
ls_debut.reset_index(drop=True, inplace=True)

In [56]:
ls_debut.to_csv("../data/debut_album_1118.csv", index=False)

### Fill in missing genres my manual imputation

* * * *

### Load csv file to double check

In [64]:
df = pd.read_csv("../data/debut_album_1118-utf8.csv")

In [65]:
print(df.shape)
df.head()

(1951, 5)


Unnamed: 0,artist,album,genre,single_count,release_date
0,2Cellos,2Cellos,"Cello rock, classical crossover",3,"June 10, 2011"
1,Deep Dark Robot,8 Songs About a Girl,"Post-grunge, punk blues",0,"MarchÂ 22,Â 2011Â (2011-03-22)"
2,The Young Professionals,9am to 5pm 5pm to Whenever\r(09:00 to 17:00 17...,Electronic,0,"September 12, 2011 (Israel)\rJune 18, 2012 (in..."
3,Flash Bang Grenada,10 Haters,Alternative hip hop,0,"AugustÂ 23,Â 2011Â (2011-08-23)"
4,Her Majesty & the Wolves,111,"Dance, pop",3,"July 11, 2011 (UK)"


### Upload to MySQL database

In [66]:
import sqlalchemy, pickle
from sqlalchemy import create_engine

In [67]:
pw = pickle.load(open("mysql_pw.pickle", "rb"))

In [68]:
engine = sqlalchemy.create_engine("mysql+mysqldb://root:" + pw + "@52.78.44.120/project_rookie")

In [69]:
df.to_sql(name="debut_album", con=engine, if_exists='replace')