In [1]:
import pandas as pd
from ftfy import fix_text
import csv

# get the data
data = pd.read_csv("all_items_gplay.csv", quoting=2)

In [2]:
data.head(30)

Unnamed: 0,game_id,item_id,n,median_amount_USD
0,94%,Pack Premium,1.0,2.99
1,94%,Premium pack,1.0,3.792736
2,2048,Remove Ads,1.0,0.99
3,50000,Coin Mega Pack,1.0,1.99
4,#DRIVE,25000 Caps + 50 Postcards,1.0,2.12
5,#DRIVE,Ads Remove Plus,1.0,6.38
6,#DRIVE,Remove Ads,1.0,3.158499
7,#open Polyamorous Dating + ENM,Supporting Membership,5.0,10.76
8,?°ã©?³ã?«ã¼?ã¡?³ã¿?¸ã¼,?°ã©?ã«?³ã¤??3000GC,6.0,25.255
9,[Premium] RPG Chronus Arc,100CAP,2.0,0.99


In [3]:
# show all the row where item id starts with ""
data[data["item_id"].str.startswith('"')]

Unnamed: 0,game_id,item_id,n,median_amount_USD
5188,Caesars Casino: Casino & Slots For Free,"""Mini"" Package",3.0,3.17
5192,Caesars Casino: Free Slots Games,"""Mini"" Package",9.0,3.17
5222,Caesars Slots: Casino games,"""Mini"" Package",4.0,3.17
5244,Caesars Slots: Free Slot Machines and Casino G...,"""Mini"" Package",4.0,3.18
10983,Drink Roulette ðº Brit' Drinking Game app,"""WTF"" pack",1.0,5.06121
13645,Fotogenic : Body & Face tune and Retouch Editor,"""Pro""",1.0,5.822294
13646,Fotogenic : Face & Body tune and Retouch Editor,"""Pro""",1.0,7.32
16267,Hero Wars - Menâs Choice Epic Fantasy RPG,"""300 emeralds""",1.0,1.99
16268,Hero Wars - Menâs Choice Epic Fantasy RPG,"""600 emeralds""",3.0,1.99
16270,Hero Wars â Fantasy Battles,"""300 emeralds""",2.0,2.524262


In [4]:
# fix mojibake
for col in data.select_dtypes(include='object'):
    data[col] = data[col].astype(str).apply(fix_text)
    
    
import re
# find all the parenthesis and content in the game id and remove the parenthesis
find_parenthesis = re.compile(r"\(([^)]*)\)")



# apply the regex to the 'item_id' column and extract the last match
data['game_name_from_item_id'] = data['item_id'].apply(
    lambda x: find_parenthesis.findall(x)[-1] if find_parenthesis.findall(x) else ""
)

# remove the parenthesis from the game_name_from_item_id
data['game_name_from_item_id'] = data['game_name_from_item_id'].str.replace(r"\(|\)", "", regex=True)

# drop "Set" from the game_name_from_item_id
data['game_name_from_item_id'] = data['game_name_from_item_id'].str.replace("Set", "", regex=False)

In [5]:
# create the game_name column

data["game_name"] = data.apply(
    lambda row: row["game_name_from_item_id"] if "Reward:" in row[col] else row["game_id"],
    axis=1
)


In [6]:
data["game_name"].value_counts()

game_name
Coin Master                     630
Love Nikki-Dress UP Queen       438
DRAGON BALL Z DOKKAN BATTLE     336
Star Wars™: Galaxy of Heroes    183
GODDESS OF VICTORY: NIKKE       167
                               ... 
Zombies.io                        1
Zombies, Run! 11                  1
Zombies, Run! 10                  1
Zombies, Run!                     1
ZOMBIES ATE MY FRIENDS            1
Name: count, Length: 10793, dtype: int64

In [7]:
# if the game_name start with [] remove the brackets and the content inside
data["game_name"] = data["game_name"].str.replace(r"\[.*?\]", "", regex=True)

# if the game_name ends with [] remove the brackets and the content inside
data["game_name"] = data["game_name"].str.replace(r"\[.*?\]$", "", regex=True)

# convert the game_name to lowercase
data["game_name"] = data["game_name"].str.lower()

# remove the trademark and copyright symbols
data["game_name"] = data["game_name"].str.replace(r"™|®", "", regex=True)

# remove the extra spaces
data["game_name"] = data["game_name"].str.replace(r"\s+", " ", regex=True)
data["game_name"] = data["game_name"].str.strip()

# remove the special characters
data["game_name"] = data["game_name"].str.replace(r"[^a-zA-Z0-9\s]", "", regex=True)



In [8]:
# for all the columns with no game_name check split the item_id by , and take the first element
data["game_name"] = data.apply(
    lambda row: row["item_id"].split(",")[0] if row["game_name"] == "" else row["game_name"],
    axis=1
)

# for all the columns where game_id is "ad free" or "Ad free", the game_name is "item_id"
data["game_name"] = data.apply(
    lambda row: row["item_id"] if row["game_id"] in ["ad free", "Ad free", "Ad Free","Ads-free"] else row["game_name"],
    axis=1
)

In [9]:
def normalize_game_name(name):
    if pd.isna(name):
        return name
    words = name.strip().split()
    words.sort()
    return ' '.join(words)

In [10]:
data["normalized_game_name"] = data["game_name"].apply(normalize_game_name)

In [11]:
data["normalized_game_name"].value_counts()

normalized_game_name
coin master                      630
love nikkidress queen up         438
ball battle dokkan dragon z      336
force marvel rpg squad strike    239
galaxy heroes of star wars       200
                                ... 
zombiesio                          1
11 run zombies                     1
10 run zombies                     1
run zombies                        1
ate friends my zombies             1
Name: count, Length: 10496, dtype: int64

In [12]:
# drop the row where normalized_game_name is empty
data = data[data["normalized_game_name"].notna() & (data["normalized_game_name"] != "")]
# drop the row where game_name contains kanji characters
data = data[data["game_name"].str.encode('ascii', errors='ignore').str.decode('ascii') == data["game_name"]]


In [13]:
categories_words = ["app", "apps", "mobile", "phone",
 "tablet", "free", "lite", "pro",
 "hd", "3d", "4d", "online", "offline",
 "real", "live"]

functionality_words =["tool", "tools", "manager", "booster", "cleaner", "optimizer",
 "scanner", "protection", "safe", "safety", "guard", "shield", "filter",
 "locker", "security", "antivirus", "antivirus", "vpn"
]

marketing_words = ["best", "top", "ultimate", "new", "latest",
 "popular", "amazing", "powerful", "easy", "fast", "super", "smart",
 "cool", "master", "expert", "genius", "plus"]

misc_words = ["guide", "tutorial", "walkthrough", "manual", "tips", "tricks",
 "how", "hack", "prank", "mod", "wallpaper", "theme", "launcher", "keyboard",
 "emoji", "camera", "photo", "editor"]

communication_words = ["call", "caller", "sms", "message", "chat",
 "id", "blocker", "blocking", "spam", "contacts", "dialer",
 "recorder"]

versions_words = ["2020", "2021", "2022", "2023", "2024", "2025",
 "version", "update", "old", "new"]


devices_words = ["android", "ios", "windows", "pc", "mac",
    "device", "system", "software",
    "mobile", "phone", "tablet", "smartphone", "computer", "laptop",
    "desktop", "notebook", "netbook", "ultrabook", "chromebook",
    "wearable", "smartwatch", "smartband", "smartglasses",
    ]

quantity_words = ["meter", "tracker", "counter", "analyzer", "monitor"]

gaming_words = ["game", "games", "play", "player", "music", "video", "movie", "stream", "streaming"]

join_words = ["for", "with", "and", "in", "of", "to", "on", "at",]

In [14]:
# remove the words from the game_name and game_id columns
for word in categories_words + functionality_words +\
 marketing_words + misc_words + communication_words +\
versions_words + devices_words + quantity_words + gaming_words + join_words:
    data["normalized_game_name"] = data["normalized_game_name"].str.replace(r"\b" + word + r"\b", "", regex=True)


In [15]:
# save the data to a csv file
data.to_csv("all_items_gplay_with_game_names.csv", index=False)

In [16]:
data["normalized_game_name"].value_counts()

normalized_game_name
coin                             630
love nikkidress queen up         438
ball battle dokkan dragon z      336
force marvel rpg squad strike    239
galaxy heroes  star wars         200
                                ... 
day judgment zombie                1
50000                              1
2048                               1
connect one platform  zoom         1
marvel move run zombies zrx        1
Name: count, Length: 10444, dtype: int64

In [17]:
for name in data["normalized_game_name"].unique():
    print(name)

94
2048
50000
drive
dating enm open polyamorous
arc chronus rpg
airaudio   your
cflumen
servicely
dude missile missile rpg tap tap
0xuniverse blockchainbased conquer galaxy the
1 2 3 4   
10 billion husbands
10 billion wives
10 bowling pin shuffle
 west
build community   pococha  your
bleacher news report sports
3 animals farm farmville
makeover nikkifashion shining
grand mafia the
hideaway hotel virtual world
animal camp crossing pocket
pet rescue saga
100 days survival zombie
100 doors escape from prison
100 challenge doors
100  pics quiz quizzes
candy crush saga
777 casino royale slots vegas
disney mirrorverse
genshin impact
webtoon
tactics teamfight tft
mahjong soul
associations sort word words
Dicey Dungeons
 Driving Test Theory UK
fifa soccer
fifa football
dragon quest tact
cookie ovenbreak run
ayakashi  otome reborn romance supernatural
ensemble  stars
epic seven
clans clash 
 tears themis
ball dragon legends
candy crush saga soda
100 life simulator years
rpg
boysgreasy money pa