In [None]:
import pandas as pd

In [None]:
# Load the game lookup table (unique game id and its name). It is joined to the
# twitch data on 'GameName' below; the same file is read again later and joined
# to the prize data on 'GameId'.
Names = pd.read_csv('game_names.csv')


In [None]:
# Load the twitch data. It contains games that are not in the prize data and
# vice versa, so the inner join below keeps only games whose 'GameName' appears
# on both sides.
twitch = pd.read_csv('twitch.csv')
data = Names.merge(twitch, how='inner', on='GameName')

In [None]:
# Next, load the country_lang dataset (one language entry per country); it is
# cleaned in the following cells.

language = pd.read_csv('country_lang.csv')

In [None]:
# The scrape also picked up entries that are not usable; they show up as missing
# values, so every row containing a missing value is dropped.
language = language.dropna()

In [None]:
# Language value contains comma and also sometimes it contains multiple language. We will pick the first one and drop the rest. 
def string(text):
    """Return the part of ``text`` that precedes the first comma.

    ``text`` is converted with ``str()`` first, so non-string values are
    accepted. When there is no comma the whole converted string is returned.
    """
    first_entry, _, _ = str(text).partition(',')
    return first_entry

# Keep only the text before the first comma in each language value.
language['language'] = language['language'].apply(string)

In [None]:
# With country_lang cleaned, attach a language to each country in the broadband data.

broad = pd.read_csv('broadband.csv')
broad = pd.merge(broad, language, left_on='Entity', right_on = 'country',how='inner')
# NOTE(review): the result of this drop is not assigned, so `broad` itself keeps
# its 'country' column; the call only produces the frame shown as this cell's
# output. The later merge with the prize data joins on ['country', 'year'], so
# `broad` must still carry 'country' at that point.
broad.drop('country', axis=1)

# The broadband data now has a language column. Next, make sure its language
# labels are consistent with the labels used in the twitch data.

In [None]:
# Compare the language labels of the two datasets in both directions to see
# which labels exist on only one side.
twi_lan = set(twitch['language'].unique())
bro_lan = set(broad['language'].unique())
print(twi_lan.difference(bro_lan))
print(bro_lan.difference(twi_lan))

In [None]:
# Collapse regional variants in the twitch data onto their base language, e.g.
# 'Chinese (Taiwan)' -> 'Chinese' and 'Spanish (Mexico)' -> 'Spanish'.
# The column is replaced in one step rather than via
# `twitch['language'].loc[mask] = value`: that chained form assigns through an
# intermediate Series, which raises chained-assignment warnings and leaves
# `twitch` unchanged when pandas copy-on-write is active.
twitch['language'] = twitch['language'].replace({
    'Chinese (Hong Kong SAR)': 'Chinese',
    'Chinese (Taiwan)': 'Chinese',
    'Spanish (Mexico)': 'Spanish',
    'Portuguese (Brazil)': 'Portuguese',
})

In [None]:
# Align the remaining broadband labels with the spelling used in the twitch
# data (e.g. 'Italian' -> 'Italians'). Mandarin is mapped to Chinese.
# Note that the broadband data has no language named Czech or Polish.
# The column is replaced in one step rather than via
# `broad['language'].loc[mask] = value`: that chained form assigns through an
# intermediate Series, which raises chained-assignment warnings and leaves
# `broad` unchanged when pandas copy-on-write is active.
broad['language'] = broad['language'].replace({
    'Catalan': 'Catalans',
    'Italian': 'Italians',
    'Hungarian': 'Hungarians',
    'Modern Greek': 'Greek',
    'Malay': 'Malaysian',
    'Mandarin': 'Chinese',
})

In [None]:
# The twitch data contains the language 'Polish', but our sample has no Polish.
# For that reason Polish rows in the twitch data are relabelled as German.
# `.loc[mask, column]` writes into `twitch` directly; the chained form
# `twitch['language'].loc[mask] = ...` assigns through an intermediate Series
# and does not update the frame under pandas copy-on-write.
twitch.loc[twitch['language'] == 'Polish', 'language'] = 'German'

In [None]:
# Finally, every language label that is still present only in the broadband
# data (bro_lan - wi_lan) is folded into a single 'Other' category.
wi_lan = set(twitch['language'].unique())
bro_lan = set(broad['language'].unique())

# Labels used by the broadband data but not by the twitch data. Computed once
# here instead of rebuilding the difference (and a list) for every row.
_broadband_only = bro_lan - wi_lan


def change_names(textstr):
    """Return 'Other' for a label only the broadband data uses, else ``textstr`` unchanged."""
    return 'Other' if textstr in _broadband_only else textstr


# Run the function on the dataframe
broad['language'] = broad['language'].apply(change_names)

In [None]:
# Check both directions again to see whether any label still needs fixing.
# The broadband data has no Czech in the sample, so Czech is dropped when the
# twitch and broadband data are merged. 'All languages' and 'American Sign
# Language' are dropped by that merge as well.
twi_lan = set(twitch['language'].unique())
bro_lan = set(broad['language'].unique())
print(twi_lan.difference(bro_lan))
print(bro_lan.difference(twi_lan))

In [None]:
# With the twitch and broadband data cleaned, turn to the prize data, which
# takes more work:
#   1. its location information is not formatted consistently;
#   2. it does not contain the game name, so one is attached here;
#   3. separate year and month columns are derived from StartDate.

prize = pd.read_csv('prize_money.csv')
game_nam = pd.read_csv('game_names.csv')

# Attach the game names, using GameId as the key.
prize = prize.merge(game_nam, on='GameId')

In [None]:
# Lower-case month names keyed by month number (1-12); used to build the month column.
month_names = dict(enumerate(
    [
        'january', 'february', 'march', 'april',
        'may', 'june', 'july', 'august',
        'september', 'october', 'november', 'december',
    ],
    start=1,
))

# Parse StartDate as month/day/two-digit-year; values that do not match become NaT.
prize['date'] = pd.to_datetime(prize['StartDate'], errors='coerce', format='%m/%d/%y')
# Show the rows whose date could not be parsed (one row has an NA date).
prize[prize['date'].isna()]

In [None]:
# Patch the StartDate of the row that failed to parse, then parse again.
# NOTE(review): 47592 is a hard-coded row label of `prize`; it presumably only
# points at the intended row while the input files and the merge above stay
# unchanged -- TODO confirm before re-running on new data.

prize.loc[47592, 'StartDate'] = '5/7/20'

prize["date"]= pd.to_datetime(prize['StartDate'], format='%m/%d/%y', errors='coerce')
# Re-check: any row listed here still has an unparseable StartDate.
prize[prize['date'].isna()]

In [None]:
# Derive the year and the lower-case month name from the parsed date, then
# drop the raw StartDate and EndDate columns.
prize = prize.assign(
    year=prize['date'].dt.year,
    month=prize['date'].dt.month.map(month_names),
).drop(columns=['StartDate', 'EndDate'])
prize.tail()

In [None]:
# Clean the location data. First make it consistent by lower-casing everything.
prize['Location'] = prize['Location'].str.lower()

# Strategy: collect the distinct location strings into their own dataframe, then
# look up the corresponding country for each one with a geocoding library.
# The column was lower-cased just above, so it is not lower-cased a second time.
location = pd.DataFrame({'location': list(prize['Location'].unique())})
location.head()

In [None]:
# This function will take the locaiton data and find the corresponding country. 
# Running this might take a while. 

from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

# One shared client for every lookup instead of a new one per call. The limiter
# spaces requests at least one second apart and retries a failed request a few
# times; with swallow_exceptions=False an error that survives the retries is
# still raised to the caller, as it was before.
_geolocator = Nominatim(user_agent="location_identifier")
_geocode = RateLimiter(_geolocator.geocode, min_delay_seconds=1, swallow_exceptions=False)


def get_country(location):
    """Return the country for a free-text location, or "Not Found".

    Parameters
    ----------
    location : str
        Free-text place description to geocode.

    Returns
    -------
    str
        The last comma-separated component of the geocoder's English address
        (stripped of surrounding whitespace), or "Not Found" when the geocoder
        returns no match or ``location`` is not a non-blank string.
    """
    # Missing values (NaN/None) and blank strings cannot be geocoded; without
    # this guard a NaN would be sent to the service as the text "nan".
    if not isinstance(location, str) or not location.strip():
        return "Not Found"

    location_info = _geocode(location, language='en', timeout=10)
    if not location_info:
        return "Not Found"
    return location_info.address.split(",")[-1].strip()

# Look up the country for every distinct location and save the result to
# uni_loc1.csv so that the slow lookup does not need to be run again.
location['country'] = location['location'].map(get_country)
location.to_csv('uni_loc1.csv', index=False)


In [None]:
# Load the saved location -> country table. It is read from the relative path
# 'uni_loc1.csv' (the name the geocoding step writes to) rather than from a
# machine-specific absolute path, so the notebook also runs on other machines.
location = pd.read_csv('uni_loc1.csv')

In [None]:
# The geocoder assigned 116 distinct countries to the location data; some of them are not accurate.
# The count is printed explicitly because a bare expression that is not the
# last statement of a cell is evaluated and then discarded.
print(location['country'].nunique())

# I attempted to find an efficient way to fix this, but fixing it manually turned out to be the easiest solution.
# The manually corrected locations were combined with the prize data into a CSV file, which is loaded here.
# Note that this rebinds `prize`, replacing the frame built in the cells above.
# The data file is available upon request: yuya19991230@gmail.com

prize = pd.read_csv('clean_prize_money.csv')

In [None]:
# Distinct game titles (column 0), sorted so that titles sharing a first word
# end up on neighbouring rows.
game = pd.DataFrame(prize['GameName'].unique()).sort_values(by=0).reset_index(drop=True)

# Games that share the same first word are treated as the same game.
first_word = game[0].str.split().str.get(0)

# Number each run of consecutive rows that have the same first word. A missing
# first word never equals its neighbour, so such a row forms a run of its own.
run_id = first_word.ne(first_word.shift()).cumsum()

# Every title in a run gets the run's first title as its common name. This
# replaces the row-by-row loop, the integer-initialised 'name' column (which was
# then filled with strings) and the hard-coded 'ARMS' seed for the first row:
# the first row is now named after its own title like every other run start.
game['name'] = game[0].groupby(run_id).transform('first')
game

In [None]:
# With equivalent games identified, join the game table to the prize data
# (prize 'GameName' against game column 0) and let the common name take over
# as 'GameName'. The same is done for the twitch data afterwards.

prize = (
    prize
    .merge(game, left_on='GameName', right_on=0)
    .drop(columns=['GameName', 0])
    .rename(columns={'name': 'GameName'})
)


In [None]:
# Join the game table to the twitch data (twitch 'GameName' against game
# column 0) and let the common name take over as 'GameName'.
twitch = (
    twitch
    .merge(game, left_on='GameName', right_on=0)
    .drop(columns=['GameName', 0])
    .rename(columns={'name': 'GameName'})
)


In [None]:
# Before merging with the broadband data, aggregate the prize data: take the
# mean and standard deviation of TotalUSDPrize within each
# (country, year, GameName, language) group.

prize = (
    prize
    .groupby(['country', 'year', 'GameName', 'language'])
    .agg(
        # Named aggregation yields the flat '<column>_<statistic>' names directly.
        TotalUSDPrize_mean=('TotalUSDPrize', 'mean'),
        TotalUSDPrize_std=('TotalUSDPrize', 'std'),
    )
    .reset_index()
)

In [None]:
# Inspect the last rows of the aggregated prize data.
prize.tail()

In [None]:
# Inspect the last rows of the broadband data.
broad.tail()

In [None]:
# The prize data is now fixed, so it can be merged with the broadband data
# using country and year as the key. The broadband columns 'Entity' and 'Code'
# are irrelevant and dropped first, and 'Year' is renamed to match the prize data.
broad = broad.drop(columns=['Entity', 'Code']).rename(columns={'Year': 'year'})
prize = prize.merge(broad, on=['country', 'year'])

In [None]:
# Keep language_y, which is consistent with the twitch data, under the plain
# name 'language' and discard language_x.
prize = prize.drop(columns='language_x').rename(columns={'language_y': 'language'})
prize.head()

In [None]:
# The twitch data is monthly while the broadband data is yearly, so the twitch
# data has to be aggregated to yearly values before it can be merged with the
# rest. For each (year, language, GameName) group we take the mean and the
# standard deviation of watch time, stream time, peak viewers, peak channels
# and streamers.

agg_twitch = (
    twitch
    .groupby(['year', 'language', 'GameName'])
    .agg({
        metric: ['mean', 'std']
        for metric in [
            'watch_time_min',
            'stream_time_min',
            'peak_viewers',
            'peak_channels',
            'streamers',
        ]
    })
    .reset_index()
)

# Flatten the two-level column labels to '<column>_<statistic>'; the group keys
# have an empty second level and keep their plain name.
agg_twitch.columns = ['_'.join(filter(None, col)) for col in agg_twitch.columns]


In [None]:
# Row counts of the three tables before the final merge.
print(len(agg_twitch), len(prize), len(broad))

In [None]:
# With the twitch data aggregated, combine it with the prize data using year,
# language and GameName as the key. The country column is dropped from the
# prize data first.
prize = prize.drop(columns='country')
final_data = prize.merge(agg_twitch, how='inner', on=['year', 'language', 'GameName'])
final_data


In [None]:
# Write the merged dataset to final_data.csv without the index column.
final_data.to_csv('final_data.csv', index=False)