# Step 1: Scraping Anime Sales Data

**Metis Project 2, Andrew Zhou**

We begin our project by scraping and parsing anime sales data from <https://www.someanithing.com/series-data-quick-view>.

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
import pickle
import time

In [2]:
url = "https://www.someanithing.com/series-data-quick-view"

# Fail fast on a hung connection or an HTTP error status instead of
# silently parsing whatever body came back.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "lxml")

# Every <td> whose direct parent is a <tr>, flattened into one long stream.
els = soup.find_all(lambda tag: tag.name == "td" and tag.parent.name == "tr")
# NOTE: `.string` is None when bs4 cannot reduce a cell to a single string,
# and str() turns that into the literal text "None".
els = map(lambda x: str(x.string), els)

# The column names, in the order the cells appear within a table row
cols = ["title", "year", "season_qtr", "avg_sales", "re_rls", "total", "gross_1st_rls", "studio", "source"]

# Passing the *same* iterator to zip once per column makes zip pull
# consecutive cells, so we grab an entire row of data at a time.
# sales_data is a list of tuples with one entry per column, where each
# tuple contains the scraped information for a single anime series.
# A trailing incomplete row (if any) is dropped by zip.
it = iter(els)
sales_data = list(zip(*[it] * len(cols)))

anime_dict = {}

# When the starting letters reset, we've reached the end
# of TV series. We're not interested in shorter-form releases
# like movies, so we use this tracker to know when to stop.
prev_starting_letter = "A"

for anime_data in sales_data:
    single_series_info = dict(zip(cols, anime_data))

    # One anime title doesn't parse properly. Because every cell went through
    # str() above, the missing title shows up as the text "None" rather than
    # the None object, so both spellings are checked.
    if single_series_info["title"] in (None, "None"):
        single_series_info["title"] = "Dog Days\""

    # Slice rather than index so an empty title cannot raise IndexError
    starting_letter = single_series_info["title"][:1]

    # We've finished scraping TV series, so stop
    if prev_starting_letter == "Z" and starting_letter == "A":
        break

    prev_starting_letter = starting_letter

    # Keyed by title: a repeated title overwrites the earlier row
    anime_dict[single_series_info["title"]] = single_series_info

# One row per series, indexed by title
anime_df = pd.DataFrame.from_dict(anime_dict, orient="index")

Next, we manually clean up some studio names so that each series can later be matched to its studio on MyAnimeList.

In [3]:
# Manually fix certain studio names so we can properly match series
# to their studios in MyAnimeList. Full matching performed later.
fix_studios = {
    "G Plus": "AIC",
    "Douga Koubou": "Doga Kobo",
    "EMT²": "EMT Squared",
    "EMT² ": "EMT Squared",
    "Oriental Light and Magic": "OLM",
    "M.S.C.": "M.S.C",
    "Phoenix Animation": "BigFireBird Animation",
    "Studio Nut": "Nut",
    "DIomedia": "Diomedea",
    "Diomedia": "Diomedea",
    "DandeLion Animation": "DandeLion Animation Studio",
    "Haoliners": "Haoliners Animation League",
    "Studio APPP": "APPP",
    "Duame": "Daume",
    "Ezola": "Ezόla",
    "Pierrot+": "Pierrot Plus",
    "Studio Puyaki": "Studio PuYUKAI",
    "Studio PuYAKAI": "Studio PuYUKAI",
    "For All": "Shin-Ei Animation",
    "Asia-Do": "Ajia-Do",
    "M2": "Studio M2",
}

# Names without an entry in fix_studios pass through unchanged
anime_df["studio"] = anime_df["studio"].map(lambda x: fix_studios.get(x, x))

# Per-series studio corrections, keyed by the title used as the row index.
studio_overrides = {
    # The studio "Wonderfarm" is listed as producing these two series
    # and no others. MyAnimeList lists them with different studios, so
    # we fix that here.
    "Mousou Kagaku Series Wandaba Style": "TNK",
    "Tenshi no Shippo": "Tokyo Kids",
    # Puso Ni Comi has no studio listed, but its producer is Sega and it
    # can be found on Sega's page
    "Puso Ni Comi": "Sega",
}

for series_title, studio_name in studio_overrides.items():
    # A single .loc[row, col] assignment writes into anime_df itself.
    # The chained form .loc[row]["studio"] = ... assigns to a temporary row
    # object and is not guaranteed to update the frame.
    # Check first: .loc assignment with an unknown label would silently
    # append a new, mostly empty row instead of failing.
    if series_title not in anime_df.index:
        raise KeyError(series_title)
    anime_df.loc[series_title, "studio"] = studio_name

# Remove the second studio from anime with two studios for simplicity.
# Only the studio column is touched, so commas in titles or other columns
# are left alone. str.split always returns at least one piece, so names
# without a comma come back unchanged.
anime_df["studio"] = anime_df["studio"].map(lambda x: x.split(",")[0])

In [8]:
# Persist the cleaned frame. Its studio names have not yet been matched
# to MAL at this point.
unmatched_pickle_path = "../data/anime_sales_df_unmatched.pickle"
anime_df.to_pickle(unmatched_pickle_path)