In [1]:
import csv
import json
import re
from pathlib import Path

import pandas as pd
from ftfy import fix_text

# get the data
# csv.QUOTE_NONNUMERIC is the named form of the literal 2 that was passed before
data = pd.read_csv("input_data/all_items_gplay.csv", quoting=csv.QUOTE_NONNUMERIC)

In [2]:
# fix mojibake in every object (text) column
# NOTE: astype(str) also turns missing values into the literal string "nan".
# NOTE(review): the loop variable `col` is read again by the later cell that
# builds game_name, so it has to keep this name until that cell names its
# column explicitly.
for col in data.select_dtypes(include='object'):
    data[col] = data[col].astype(str).apply(fix_text)


# matches "(...)" groups and captures the text between the parentheses
find_parenthesis = re.compile(r"\(([^)]*)\)")


def _last_parenthesized(text):
    """Return the content of the last "(...)" group in text, or "" if there is none."""
    matches = find_parenthesis.findall(text)  # scan each value only once
    return matches[-1] if matches else ""


# apply the regex to the 'item_id' column and keep the last match
data['game_name_from_item_id'] = data['item_id'].apply(_last_parenthesized)

# strip any parenthesis characters left inside the captured text
data['game_name_from_item_id'] = data['game_name_from_item_id'].str.replace(r"\(|\)", "", regex=True)

# drop every occurrence of the literal substring "Set" (case-sensitive, also inside longer words)
data['game_name_from_item_id'] = data['game_name_from_item_id'].str.replace("Set", "", regex=False)

In [3]:
# create the game_name column
# Rows whose `col` value contains "Reward:" take the name extracted from the
# parentheses in item_id; every other row falls back to game_id.
# NOTE(review): `col` is not defined in this cell -- it is the loop variable
# leaked from the mojibake-fixing loop in the previous cell, i.e. whichever
# object column that loop visited last. This looks unintended and makes the
# result depend on column order; replace it with the explicit column name
# (presumably "item_id" or "game_id" -- confirm against the data).

data["game_name"] = data.apply(
    lambda row: row["game_name_from_item_id"] if "Reward:" in row[col] else row["game_id"],
    axis=1
)


In [4]:
# show how many rows fall under each game_name (most frequent first)
data["game_name"].value_counts()

game_name
Coin Master                     630
Love Nikki-Dress UP Queen       438
DRAGON BALL Z DOKKAN BATTLE     336
Star Wars™: Galaxy of Heroes    183
GODDESS OF VICTORY: NIKKE       167
                               ... 
Zombies.io                        1
Zombies, Run! 11                  1
Zombies, Run! 10                  1
Zombies, Run!                     1
ZOMBIES ATE MY FRIENDS            1
Name: count, Length: 10793, dtype: int64

In [5]:
# remove every [...] group (brackets and their content) wherever it occurs in
# the name -- leading, trailing or in the middle. The pattern is unanchored, so
# a separate pass for trailing groups would never find anything left to remove.
data["game_name"] = data["game_name"].str.replace(r"\[.*?\]", "", regex=True)

# convert the game_name to lowercase
data["game_name"] = data["game_name"].str.lower()

# remove the trademark (™) and registered-trademark (®) symbols
data["game_name"] = data["game_name"].str.replace(r"™|®", "", regex=True)

# collapse runs of whitespace into a single space, then trim both ends
data["game_name"] = data["game_name"].str.replace(r"\s+", " ", regex=True)
data["game_name"] = data["game_name"].str.strip()

# remove the special characters but keep accents and spaces
#data["game_name"] = data["game_name"].str.replace(r"[^a-zA-Z0-9\s]", "", regex=True)




In [6]:
# game_id values that are treated as placeholders rather than game names
PLACEHOLDER_GAME_IDS = ["ad free", "Ad free", "Ad Free", "Ads-free", "50% OFF"]

# for all the rows with an empty game_name: split the item_id by "," and keep
# everything but the last element (this yields "" when item_id has no comma)
item_id_without_last_part = data["item_id"].str.split(",").str[:-1].str.join(",")
data["game_name"] = data["game_name"].mask(data["game_name"] == "", item_id_without_last_part)

# for all the rows whose game_id is one of the placeholders, the game_name is the full item_id
data["game_name"] = data["game_name"].mask(data["game_id"].isin(PLACEHOLDER_GAME_IDS), data["item_id"])

In [7]:
def normalize_game_name(name):
    """Build a word-order-independent key for a game name.

    The name is split on whitespace, the words are sorted, and the result is
    re-joined with single spaces, so names made of the same words in a
    different order produce the same key. Missing values (anything
    ``pd.isna`` flags) are returned unchanged.
    """
    if not pd.isna(name):
        # split() without a separator already ignores leading/trailing whitespace
        return ' '.join(sorted(name.split()))
    return name

In [8]:
# add a word-order-insensitive key: the words of game_name in sorted order
data["normalized_game_name"] = data["game_name"].apply(normalize_game_name)

In [9]:
# show how many rows share each normalized name (most frequent first)
data["normalized_game_name"].value_counts()

normalized_game_name
coin master                       630
love nikki-dress queen up         438
ball battle dokkan dragon z       336
galaxy heroes of star wars:       183
goddess nikke of victory:         167
                                 ... 
- 1 2 3 4 games offline player      1
10 billion husbands                 1
10 bowling pin shuffle              1
defense: survival war z zombie      1
2:survive frontier zombie           1
Name: count, Length: 10672, dtype: int64

In [10]:
# save the data to a csv file
output_path = Path("output_data/all_items_gplay_with_game_names.csv")
# create the output folder first so a fresh checkout does not fail at the very last step
output_path.parent.mkdir(parents=True, exist_ok=True)
data.to_csv(output_path, index=False)

In [11]:
# collect the distinct game_name values
# NOTE(review): the original comment said the list is saved to a json file,
# but nothing is written in this cell -- confirm whether a json.dump step is
# missing or was dropped.
unique_game_names = data["game_name"].unique()


['94%' '2048' '50,000' ... 'z-warrior chronicles'
 'zynga poker – texas holdem'
 'zynga poker – free texas holdem online card games']
