In [56]:
import itertools
import pickle
import re
import string
import time
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup



def get_soup(url, timeout=30):
    """Download ``url`` and parse the response body with the lxml parser.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        A ``BeautifulSoup`` object built from the response text.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (connection failure, timeout, ...).
    """
    # The HTTP status is deliberately not checked: the body is parsed
    # whatever the server answered, exactly as before.
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, "lxml")

# Load the pickled sales data (v1); later cells read and overwrite its "studio" column.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
anime_sales_df = pd.read_pickle("../data/anime_sales_df_v1.pickle")


In [57]:
def get_anime_of_studio(studio_url, delay=2, timeout=30):
    """Collect the titles and links listed on every page of a studio URL.

    Pages are requested as ``studio_url + "?page=N"`` for N = 1, 2, ... and
    each ``<h2 class="h2_anime_title">`` heading contributes one title and
    the ``href`` of its first link.

    Args:
        studio_url: Base URL of the studio page (printed as a progress line).
        delay: Seconds to sleep before each request.
        timeout: Seconds to wait for each response before giving up.

    Returns:
        A ``(anime_titles, anime_links)`` tuple of two equally long lists,
        in the order the entries were encountered.

    Raises:
        requests.exceptions.RequestException: If a request cannot be
            completed (connection failure, timeout, ...).
    """
    anime_titles = []
    anime_links = []
    seen_links = set()
    print(studio_url)

    page_num = 1
    while True:
        time.sleep(delay)

        page_url = studio_url + "?page=" + str(page_num)
        # No status check: the loop ends on the first page without title
        # headings, whatever HTTP status that page was served with.
        response = requests.get(page_url, timeout=timeout)
        soup = BeautifulSoup(response.text, "lxml")

        page_titles = []
        page_links = []
        for el in soup.find_all("h2", class_="h2_anime_title"):
            anchor = el.a
            href = anchor.get("href") if anchor is not None else None
            if href is None:
                # A heading without a link has nothing to record; skipping it
                # keeps the title and link lists aligned.
                continue
            if el.string is not None:
                title = str(el.string)
            else:
                # `.string` is None when the heading has more than one child;
                # fall back to the link text rather than storing "None".
                title = anchor.get_text(strip=True)
            page_titles.append(title)
            page_links.append(href)

        # Stop on an empty page, and also when a page brings nothing new, so a
        # server that repeats its last page cannot keep the loop running forever.
        if not page_links or seen_links.issuperset(page_links):
            break

        anime_titles += page_titles
        anime_links += page_links
        seen_links.update(page_links)
        page_num += 1

    return anime_titles, anime_links

In [60]:
def get_mal_studio_list():
    """Scrape the MyAnimeList producer index for studio names and links.

    Every ``<a>`` whose ``href`` contains ``/anime/producer/`` and whose text
    looks like ``"Name (123)"`` yields one entry. Links whose text has no
    trailing count are skipped instead of raising ``AttributeError``.

    Returns:
        A ``(studio_names, studio_links)`` tuple of two equally long lists;
        names have the count removed and links are absolute URLs.
    """
    base_url = "https://myanimelist.net"
    studios_url = "https://myanimelist.net/anime/producer"
    soup = get_soup(studios_url)

    name_pattern = re.compile(r"(.*) \(\d+\)")

    studio_names = []
    studio_links = []
    for el in soup.find_all("a", href=re.compile(r"/anime/producer/")):
        match = name_pattern.match(el.get_text().strip())
        if match is None:
            continue
        studio_names.append(match.group(1).strip())
        # urljoin prefixes the host for relative hrefs and leaves an
        # already-absolute href untouched.
        studio_links.append(urljoin(base_url, el.attrs["href"]))

    return studio_names, studio_links

def studio_name_matcher(sales_studio_names, mal_studio_names):
    """Map each sales-data studio name to its MyAnimeList spelling.

    Two names match when they are equal after lower-casing and removing
    ASCII punctuation and spaces (so ``"AIC A.S.T.A."`` matches ``"AIC ASTA"``).
    When several MAL names normalise to the same key, the first one wins.

    Args:
        sales_studio_names: Iterable of studio names to translate.
        mal_studio_names: Iterable of candidate MyAnimeList studio names.

    Returns:
        A dict ``{sales_name: mal_name}``. Names with no counterpart are
        reported on stdout and left out of the dict.
    """
    translator = str.maketrans('', '', string.punctuation + ' ')

    def find_mod_name(name):
        return name.lower().translate(translator)

    # Normalised name -> original MAL name; setdefault keeps the first
    # occurrence, as the previous list.index lookup did.
    mal_by_mod_name = {}
    for mal_name in mal_studio_names:
        mal_by_mod_name.setdefault(find_mod_name(mal_name), mal_name)

    matcher = {}
    for name in sales_studio_names:
        mod_name = find_mod_name(name)
        if mod_name in mal_by_mod_name:
            matcher[name] = mal_by_mod_name[mod_name]
        else:
            print("Exception: Studio name unmatched")
            print("Studio name:", name)

    return matcher

In [61]:
# Fetch MAL's studio list and map the sales data's studio spellings onto it.
mal_studio_names, mal_studio_links = get_mal_studio_list()
sales_studio_names = anime_sales_df["studio"].unique()
matcher = studio_name_matcher(sales_studio_names, mal_studio_names)

# A name missing from `matcher` keeps its original spelling instead of
# aborting the whole cell with a KeyError.
anime_sales_df["studio"] = anime_sales_df["studio"].map(lambda x: matcher.get(x, x))

# NOTE(review): this writes to the current directory while the other pickles
# in this notebook live under ../data -- confirm the location is intended.
anime_sales_df.to_pickle("anime_sales_df_v2.pickle")

In [69]:
# how to get this to be able to save info during execution?
# rewrite so it calls a function to get an individual studio's info
# maybe a try except can help with interruption?
def get_studio_df(matched_studio_names):
    """Build a per-studio table of MyAnimeList links and anime listings.

    The studio list comes from ``get_mal_studio_list()``; for every studio
    whose name is in ``matched_studio_names`` the anime listing is fetched
    with ``get_anime_of_studio()``.

    Args:
        matched_studio_names: Iterable of studio names to keep.

    Returns:
        A DataFrame indexed by studio name with two columns: ``link`` (the
        studio page URL) and ``anime_info`` (a dict of anime title -> link).
        The frame is empty, with the same columns, when nothing matches.
    """
    # TODO (from the original notes): accept a title -> link dict instead of
    # re-scraping the studio list here.
    wanted = set(matched_studio_names)
    studio_names, studio_links = get_mal_studio_list()

    index = []
    records = []
    for studio_name, link in zip(studio_names, studio_links):
        if studio_name not in wanted:
            continue
        anime_names, anime_links = get_anime_of_studio(link)
        index.append(studio_name)
        records.append({"link": link, "anime_info": dict(zip(anime_names, anime_links))})

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # re-copied the whole frame on every call.
    return pd.DataFrame(records, index=index, columns=["link", "anime_info"])


In [70]:
# Scrape the anime listing of every studio named in the sales data, then pickle the result.
# get_anime_of_studio sleeps before each page request, so this cell is slow to run.
studio_df = get_studio_df(anime_sales_df["studio"].unique())
studio_df.to_pickle("../data/studio_df_v1.pickle")

https://myanimelist.net/anime/producer/441/8bit
https://myanimelist.net/anime/producer/56/A-1_Pictures
https://myanimelist.net/anime/producer/1257/A-Real
https://myanimelist.net/anime/producer/179/ACGT
https://myanimelist.net/anime/producer/2085/Acca_effe
https://myanimelist.net/anime/producer/60/Actas
https://myanimelist.net/anime/producer/48/AIC
https://myanimelist.net/anime/producer/88/AIC_ASTA
https://myanimelist.net/anime/producer/436/AIC_Build
https://myanimelist.net/anime/producer/1306/AIC_Classic
https://myanimelist.net/anime/producer/292/AIC_Plus_
https://myanimelist.net/anime/producer/83/AIC_Spirits
https://myanimelist.net/anime/producer/30/Ajia-Do
https://myanimelist.net/anime/producer/1983/Anima_Co
https://myanimelist.net/anime/producer/77/APPP
https://myanimelist.net/anime/producer/38/Arms
https://myanimelist.net/anime/producer/8/Artland
https://myanimelist.net/anime/producer/406/Asahi_Production
https://myanimelist.net/anime/producer/163/Asread
https://myanimelist.net/ani

In [68]:
# Display the distinct values of the "studio" column.
anime_sales_df["studio"].unique()

array(['Bee Train', 'Ishimori Entertainment', 'Studio Deen', 'Doga Kobo',
       'Gonzo', 'A-1 Pictures', 'Lilix', 'Shaft', 'Gathering',
       'Hoods Entertainment', 'Shuka', 'Studio Gokumi', 'P.A. Works',
       'AIC', 'Gainax', '8bit', 'Madhouse', 'Sunrise', 'Diomedea',
       'Production IMS', 'Drive', 'HOTZIPANG', 'Dream Creation', 'Seven',
       'Bones', 'J.C.Staff', 'Kyoto Animation', 'Toei Animation',
       'TMS Entertainment', 'LIDENFILMS', 'Radix', 'White Fox', 'TNK',
       'DandeLion Animation Studio', 'Studio Pierrot', 'Satelight',
       "Brain's Base", 'Yumeta Company', 'Zexcs', 'Magic Bus',
       'EMT Squared', 'Silver Link.', 'NAZ', 'Shin-Ei Animation',
       'WAO World', 'Rising Force', 'Lerche', 'Nippon Animation',
       'Production I.G', 'Group TAC', 'Hal Film Maker', 'Asread',
       'Dynamo Pictures', 'Seven Arcs', 'Chaos Project', 'Tokyo Kids',
       'Bibury Animation Studios', 'Bandai Namco Pictures', 'Revoroot',
       'AIC A.S.T.A.', 'MAPPA', 'Xebec', 'K