In [54]:
import pandas as pd
from pandas import option_context
import numpy as np
import re
import pickle

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, PCA
from sklearn.metrics import pairwise_distances

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import MWETokenizer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer, PorterStemmer

In [2]:
df = pd.read_csv('04-data/preprocessed_app_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22015 entries, 0 to 22014
Data columns (total 56 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   title                     22015 non-null  object 
 1   description               22015 non-null  object 
 2   summary                   22014 non-null  object 
 3   installs                  22015 non-null  object 
 4   minInstalls               22015 non-null  float64
 5   score                     22015 non-null  float64
 6   ratings                   22015 non-null  float64
 7   reviews                   22015 non-null  float64
 8   histogram                 22015 non-null  object 
 9   price                     22015 non-null  float64
 10  free                      22015 non-null  int64  
 11  currency                  22015 non-null  object 
 12  sale                      22015 non-null  bool   
 13  offersIAP                 22015 non-null  bool   
 14  inAppP

In [3]:
df = df[df['description_clean'].notna()]
df = df[df['score'] >= 3.8]

In [4]:
df_analysis = df[['title','description_clean']].copy()
df_analysis.reset_index(drop=True,inplace=True)

In [5]:
df_analysis

Unnamed: 0,title,description_clean
0,World War 2: Offline Strategy,command allies in 25 epic world war 2 locatio...
1,"All PDF - PDF Reader, PDF Viewer & PDF Converter",pdf reader for android pdfs converter free is ...
2,MSN Sports - Scores & Schedule,be in a league of your ownget real-time game u...
3,QRbot: QR & barcode reader,scan all kinds of qr codes and barcodes with t...
4,QR & Barcode Scanner,qr barcode scannerthis qr barcode scanner wi...
...,...,...
17136,Aksara Jawa - Nulis Aksara Jawa | Ketik & Konv...,aksara jawa adalah salah satu aset budaya indo...
17137,Mp3 Songs Download,the application provides search stream and dow...
17138,PDF Maker,main features of orangepalm s pdf maker app- s...
17139,DSLR Blur Photo,this app lets you blur parts of your photo whi...


In [6]:
stop = stopwords.words('english')
stop.extend(['free','new','get','hd','use','game','games','make','makes','play',
             'fun','features', 'need','live', 'also','using','best','us','app','apps', 'one', '2020','2021',
            'this','like','enjoy','thing','free'])

# # Extend for most common English adverbs

stop.extend(['up','so','out','just','now','how','then','more','also','here',
            'well','only','very','even','back','there','down','still','in',
            'as','to','when','never','really','most','on','why','about','over',
            'again','where','right','off','always','today','all','far','long',
            'away','yet','often','ever','however','almost','later','much',
            'once','least','ago','together','around','already','enough','both',
            'maybe','actually','probably','home','of course','perhaps','little',
            'else','sometimes','finally','less','better','early','especially',
            'either','quite','simply','nearly','soon','certainly','quickly',
            'no','recently','before','usually','thus','exactly','hard',
            'particularly','forward','ok','okay','clearly','indeed',
            'rather','that','tonight','close','suddenly','best','instead',
            'ahead','fast','alone','eventually','directly'])

# # Extend for most common irregular verbs (except pay,lose,send,buy,spend)

# stop.extend(['say','make','go','take','come','see','know','get','got','give',
#             'find','think','tell','show','leave','feel','put','bring',
#             'begin','keep','hold','stand','hear','let','mean','set',
#             'sit','lie','lead','fall','break','rise','drive','choose'])

# ## Extend for prepositions
stop.extend(['without','among'])

## Stemmed

In [7]:
df['tokens'] = df.description_clean.apply(nltk.word_tokenize)

In [8]:
stemmer = PorterStemmer()

df['stemmed'] = df['tokens'].apply(lambda x: [stemmer.stem(y) for y in x])

In [9]:
df.tokens = df.tokens.apply(lambda x: ' '.join(x))

In [10]:
df.stemmed = df.stemmed.apply(lambda x: ' '.join(x))

In [11]:
df_analysis = df[['title','stemmed']].copy()
df_analysis.reset_index(drop=True,inplace=True)

In [12]:
df_analysis

Unnamed: 0,title,stemmed
0,World War 2: Offline Strategy,command alli in 25 epic world war 2 locat we h...
1,"All PDF - PDF Reader, PDF Viewer & PDF Converter",pdf reader for android pdf convert free is one...
2,MSN Sports - Scores & Schedule,be in a leagu of your ownget real-tim game upd...
3,QRbot: QR & barcode reader,scan all kind of qr code and barcod with the q...
4,QR & Barcode Scanner,qr barcod scannerthi qr barcod scanner will le...
...,...,...
17136,Aksara Jawa - Nulis Aksara Jawa | Ketik & Konv...,aksara jawa adalah salah satu aset budaya indo...
17137,Mp3 Songs Download,the applic provid search stream and download a...
17138,PDF Maker,main featur of orangepalm s pdf maker app- sel...
17139,DSLR Blur Photo,thi app let you blur part of your photo which ...


In [13]:
vectorizer = CountVectorizer(stop_words = stop)

doc_word = vectorizer.fit_transform(df_analysis.stemmed)



In [14]:
nmf_model = NMF(30)
doc_topic = nmf_model.fit_transform(doc_word)
topic_word = nmf_model.components_

In [15]:
words = vectorizer.get_feature_names()
t = nmf_model.components_.argsort(axis=1)[:,-1:-10:-1]
topic_words = [[words[e] for e in l] for l in t]
topic_words

[['card',
  'account',
  'busi',
  'bank',
  'creat',
  'invit',
  'money',
  'solitair',
  'player'],
 ['car', 'drive', 'race', 'park', 'drift', 'stunt', 'simul', 'extrem', 'real'],
 ['photo',
  'frame',
  'editor',
  'effect',
  'pictur',
  'imag',
  'background',
  'collag',
  'camera'],
 ['robot',
  'transform',
  'fli',
  'war',
  'fight',
  'battl',
  'car',
  'futurist',
  'citi'],
 ['video',
  'maker',
  'chat',
  'creat',
  'audio',
  'player',
  'edit',
  'download',
  'effect'],
 ['keyboard',
  'theme',
  'emoji',
  'type',
  'phone',
  'android',
  'instal',
  'galaxi',
  'font'],
 ['color',
  'book',
  'page',
  'number',
  'girl',
  'paint',
  'glitter',
  'beauti',
  'kid'],
 ['truck',
  'drive',
  'transport',
  'simul',
  'cargo',
  'driver',
  'offroad',
  'road',
  'armi'],
 ['shoot',
  'gun',
  'fp',
  'sniper',
  'shooter',
  'mission',
  'commando',
  'terrorist',
  'armi'],
 ['bike',
  'race',
  'stunt',
  'ramp',
  'track',
  'imposs',
  'drive',
  'moto',
  'ex

In [16]:
df_analysis['topic'] = doc_topic.argmax(axis=1)

In [17]:
df_analysis[df_analysis['topic'] == 20].sample(60)

Unnamed: 0,title,stemmed,topic
11246,Volleyball Championship 2014,volleybal championship 2014 is a full-scal wor...,20
16655,Iron Stickman Rope Hero Gangstar Crime,stickman rope hero vega crime simul iron man t...,20
1513,Brazilian Dama - Online,brazilian dama also known as draught or checke...,20
3833,King of Crabs,intens multiplay fun battl with up to 100 real...,20
1165,1to50,1to50 is an easi and fun game of touch from 1 ...,20
7589,Soda Dungeon 2,your favorit fizzi dungeon crawler is back the...,20
7929,Gods and Glory: War for the Throne,launch yourself into an intens tactic real-tim...,20
9976,Circuroid,circuroid is an uniqu fast-pac arcad shooter w...,20
167,Idle Space Miner - Simulator & Tycoon & Manage...,welcom to the most excit idl space miner - sim...,20
4125,Crazy Snake,immers yourself into 8-bit ambianc of old scho...,20


In [18]:
doc_topic_nmf = pd.DataFrame(doc_topic.round(5),
                             index = df_analysis['title'])
doc_topic_nmf

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
World War 2: Offline Strategy,0.00000,0.00000,0.00000,0.04071,0.00000,0.00000,0.00000,0.00000,0.03474,0.00000,...,0.11817,0.00067,0.02801,0.00000,0.00010,0.03944,0.00000,0.00000,0.00000,0.00099
"All PDF - PDF Reader, PDF Viewer & PDF Converter",0.00377,0.00000,0.01095,0.00000,0.00000,0.00000,0.09151,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.04383,0.00000,0.23065,0.06018,0.00000,0.00000,0.00000
MSN Sports - Scores & Schedule,0.01579,0.00678,0.00000,0.00000,0.04138,0.00000,0.00047,0.00000,0.00066,0.00346,...,0.07839,0.00329,0.01704,0.02704,0.00000,0.00000,0.00721,0.02443,0.00133,0.02104
QRbot: QR & barcode reader,0.04767,0.00000,0.00253,0.00000,0.00000,0.00000,0.01372,0.01026,0.00000,0.00000,...,0.00000,0.00000,0.29019,0.01342,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
QR & Barcode Scanner,0.03866,0.00000,0.00311,0.00000,0.00000,0.00258,0.05150,0.00000,0.00054,0.00153,...,0.02051,0.01498,0.05639,0.01222,0.00156,0.00083,0.01209,0.00018,0.00296,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Aksara Jawa - Nulis Aksara Jawa | Ketik & Konversi,0.00000,0.00000,0.00000,0.00000,0.00212,0.00026,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00740,0.00000,0.00000,0.00000,0.00000,0.00000,0.02653
Mp3 Songs Download,0.00153,0.00160,0.00000,0.00000,0.00000,0.00220,0.00000,0.00066,0.00000,0.03992,...,0.00000,0.00000,0.04694,0.15258,0.05232,0.12104,0.00000,0.06906,0.00000,0.04732
PDF Maker,0.01593,0.00000,0.30602,0.00000,0.02724,0.00000,0.00000,0.00000,0.00059,0.00000,...,0.00000,0.00000,0.00000,0.04384,0.05278,0.10124,0.00191,0.00000,0.01374,0.00000
DSLR Blur Photo,0.02296,0.00000,0.10405,0.00035,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.04944,0.00000,0.04573,0.00134,0.00000,0.00000,0.00000


In [19]:
'Mp3 Songs Download' in df_analysis['title']

False

### First recommender

In [20]:
def recommend_app(search):
    app = [search]
    tt = nmf_model.transform(vectorizer.transform(app))
    rec_array = pairwise_distances(tt,doc_topic,metric='cosine').argsort()
    recs = rec_array[0]
    name = df_analysis.loc[recs]
    
    return name[0:5]

In [21]:
recommend_app('airline flights')

Unnamed: 0,title,stemmed,topic
12053,Hotels and Flights,find the best deal from 800 airlin and 1200000...,18
13762,FabHotels: Safe Rooms on Best Hotel Booking App,download fabhotel onlin hotel book app book an...,6
6023,Choice Hotels,travel connect with the choic hotel app everyt...,0
298,"Almosafer: Hotels, Flights and Holidays",whether your plan a famili holiday busi trip g...,0
15522,Treebo: Hotel Booking App | Book Safe Stays,treebo hotel book safe stay with the best hote...,6


In [22]:
recommend_app('play card games')

Unnamed: 0,title,stemmed,topic
7443,TriPeaks Solitaire Challenge,play the popular solitair game tripeak for fre...,0
17085,"WEWIN (Weme, beme) Vietnam's national card game",vietnames card game all vietnames tradit card ...,0
10421,mobile-pocket loyalty cards wallet,mobile-pocket wallet is the easiest way to put...,0
7299,FreeCell,freecel is a classic and popular card gamego o...,0
10419,Cards - Mobile Wallet,some of our user may experi connect issu we ar...,0


In [23]:
recommend_app('learn to speak hindi')

Unnamed: 0,title,stemmed,topic
14864,My Korean Teacher : Quiz,my korean teacher is a game to learn korean th...,29
7603,"Learn reading, speaking English for Kids - BiBo",bibo speak english is an educ applic design fr...,29
7599,Studycat: Learn English for Kids,learn english is fun with studycatour educ lan...,29
4893,German Language Learning - Busuu,were busuu were an app that make learn a langu...,29
4899,Parla: Learn Spanish Free,parla x is an app for learn languag fast join ...,29


In [24]:
recommend_app('algebra lessons')

Unnamed: 0,title,stemmed,topic
6915,"Baby tracker - feeding, sleep and diaper",babi tracker- breastfeed and babi feed tracker...,14
13333,MentalUP - Learning Games & Brain Games,kid want to play mentalup educ learn game brin...,14
15925,Baby Hazel Tomato Farming,kid can play babi hazel tomato farm game for f...,14
7244,Math Master Educational Game and Brain Workout,welcom to the offici math mentor app from pari...,14
4569,Brilliant,featur in the atlant the new york time npr and...,14


In [25]:
recommend_app('convert pdf quickly')

Unnamed: 0,title,stemmed,topic
13879,Doc to PDF Converter (xls ppt word png jpg csv...,ani one can easili convert hisher document fil...,17
10370,Explorer+ File Manager,explor a practic no-nonsens file manager- brow...,17
13372,PDF File Reader,docdocx viewer integratedbest pdf reader for 2...,17
11982,File Explorer,a simpl file explor with basic function for th...,17
10652,"WPS PDF - Free For PDF Scan, Read, Edit, Convert",featur of wp pdf readerif you love read pdf an...,17


In [26]:
recommend_app('first person shooter')

Unnamed: 0,title,stemmed,topic
11027,Train shooting - Zombie War,train shoot - zombi war come with a highli add...,8
5384,Raccoon Bubbles,shoot bubbl ha never been more relax and funhe...,8
14305,Call of Modern World War Hero Sniper Assassin,here you will becom the best sniper of the wor...,8
2813,Gun Master 3: Zombie Slayer,hone your shoot skill in the militari grade ar...,8
9101,Pocket Basketball,pocket basketbal is a simpl yet challeng baske...,8


In [27]:
recommend_app('pet dog')

Unnamed: 0,title,stemmed,topic
15531,Kids Policeman games: Hippo Detective,a crimin world is in action again it mean that...,19
11385,Police Dog Attack Prison Break,rivalri between prison and polic game take a n...,19
14532,Police Dog Airport Crime Chase : Dog Games,the secur of the airport is in your hand now t...,19
4976,Flying Horse Police Chase : US Police Horse Games,fli polic hors chase crimin with fli hors ride...,19
15050,Police Scanner X,with polic scanner x you can listen to policef...,19


In [28]:
recommend_app('meet other singles')

Unnamed: 0,title,stemmed,topic
13295,Online Girls Chat Meet,top featur of onlin girl free download app fre...,15
1745,"Hello - Talk, Chat & Meet",alreadi 500000000 call and 250000000 match and...,15
16666,Stranger chat: meet new people,fed up with your current random chat unknown c...,15
2431,"Mashi - Free Voice Chat Rooms , Party in the Room",mashi the no1 local free voic chat room app in...,15
799,PAGO - Qwikmatch Live chat for Among Us,be socialpago is the newest version of qwikmat...,15


In [29]:
recommend_app('space simulator')

Unnamed: 0,title,stemmed,topic
371,MapleStory M - Open World MMORPG,maplestori m the highli anticip follow-up to m...,20
8235,HEIR OF LIGHT,a dark fantasi collect rpgdark ha overtaken th...,20
6503,CastleStorm - Free to Siege,welcom to the world of castlestorm - free to s...,20
16831,Illuminati vs. Memes MLG,get readi 2 gr8 illuminati battl against memes...,20
2786,Fantasy War Tactics R,what newpatch note new expedi area geumhwa pal...,20


In [30]:
recommend_app('mobile banking')

Unnamed: 0,title,stemmed,topic
5278,POSB digibank,everyday bank made simple-peek at account bala...,0
5280,DBS digibank SG,everyday bank made simple-peek at account bala...,0
5803,LogDog - Mobile Security 2021,trust by million featur on pc magazin techcrun...,0
1126,PassKeep - offline password manager,passkeep is the easiest and the most secur way...,22
1571,Standard Bank / Stanbic Bank,the standard bank app give you full visibl of ...,0


In [31]:
recommend_app('war strategy')

Unnamed: 0,title,stemmed,topic
7080,Cat Shooting War: Offline Gunner TD Battles,protect your own felin kingdom from evil fight...,20
12143,Tank Attack Blitz: Panzer War Machines,welcom to tank attack blitz panzer war game th...,20
130,Cyber Dead: Metal Zombie Shooting Super Squad,the dead aris and mix with cyber you mean unde...,20
4042,Call of Craft: Blocky Tanks Battlefield,welcom to the world of the tank and awesom gun...,20
5138,Dead Invaders: FPS Shooting Game & Modern War 3D,becom last line of defens and save mankind fro...,20


In [32]:
recommend_app('make new friends')

Unnamed: 0,title,stemmed,topic
3383,Haya - Group Voice Chat App,haya fun voic social chat togeth gather friend...,26
482,Yokai Tamer,yokai tamer is a super popular japanes style m...,20
3946,Hello Kitty Friends,match 2 blast puzzl game from superawesom lead...,26
2432,Sango - Free Live Group Voice Chat Rooms,sango is an interact live group voic chat and ...,20
8301,Gaia Odyssey,gaia odyssey is a 3d action rpg that give you ...,20


In [33]:
recommend_app('hindi translation')

Unnamed: 0,title,stemmed,topic
4686,dict.cc dictionary,dictionari for 51 languag combin usabl without...,29
12360,English - Hindi Translator,learn hindi or visit a hindi speak countri eas...,29
326,Portuguese English Dictionary & Translator Free,easili learn portugues with portugues english ...,29
9516,Turkish English Dictionary & Translator Free,easili learn turkish english with turkish engl...,29
4992,Spanish English Dictionary & Translator Free,easili learn spanish english with spanish engl...,29


In [34]:
recommend_app('listen to music')

Unnamed: 0,title,stemmed,topic
14245,Music Player For Samsung,music player for samsung galaxi - s11 music pl...,13
2320,Lowriders Comeback 2 : Russia,the biggest music game about lowridersfeatures...,13
16172,Equalizer Pro & Bass Booster,equal pro bass booster with easy-to-us home wi...,13
13338,Free Music for SoundCloud,million of trendinghottop music all free downl...,13
3535,PowerAudio Pro Music Player,poweraudio music player is the mostpow music p...,13


In [35]:
recommend_app('piano lessons')

Unnamed: 0,title,stemmed,topic
12050,Music Kids - Songs & Music Instruments,music kid is a great fun music box creat espec...,13
7204,Lullabies Relax & Sleep Baby,want your babi to fall asleep quickli just cho...,13
7102,Music box to sleep,your babi can not sleep cri a lot whi do not y...,13
7103,Sound to children sleep,a beauti sound to the parent put their babi to...,13
7955,Baby Zoo Piano with Music for Toddlers and Kids,babi zoo piano is an excel entertain and educ ...,14


In [36]:
recommend_app('secure vpn app')

Unnamed: 0,title,stemmed,topic
11348,Free WiFi Internet - Data Usage Monitor,app offer updat premium featur for free who us...,22
10828,WiFi Router Scanner - Who is on my WiFi?,wifi router scanner - who is on my wifi is pow...,22
11911,Chart signals & Network speed test 3g 4g 5g Wi-Fi,chart signal network speed test 3g 4g 5g wi-fi...,22
5766,WiFi Speed Test - WiFi Signal Strength Meter,wifi speed test - wifi signal strength meter i...,22
2116,Who is on my WiFi - Network Scanner & WiFi Sca...,who is on my wifi - network scanner wifi scann...,22


In [64]:
df_short = df[['title','description','summary','score','free','genre','icon','year','contentRating']].copy()

In [65]:
df_merged = pd.merge(
                    df_short,
                    doc_topic_nmf,
                    how="inner",
                    on='title',
                    left_index=False,
                    right_index=False,
                    sort=True,
                    suffixes=("_x", "_y"),
                    copy=True,
                    indicator=False,
                    validate=None,
)

In [66]:
with open('df_apps_topics.pkl', 'wb') as f:
    pickle.dump(df_merged, f)

In [56]:
with open('nmf_model.pkl', 'wb') as f:
    pickle.dump(nmf_model, f)

In [57]:
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

In [58]:
with open('doc_topic_array.pkl', 'wb') as f:
    pickle.dump(doc_topic, f)

In [37]:
#Alarm Adzan Otomatis Muslim Indonesia 2020 : T...
#Aksara Jawa - Nulis Aksara Jawa | Ketik & Konv...
#Lotto Results - Mega Millions Powerball Lotter...