In [1]:
import pandas as pd
from pandas import option_context
import numpy as np
import re
import pickle

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, PCA
from sklearn.metrics import pairwise_distances

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import MWETokenizer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer, PorterStemmer

In [2]:
df = pd.read_csv('04-data/preprocessed_app_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22015 entries, 0 to 22014
Data columns (total 56 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   title                     22015 non-null  object 
 1   description               22015 non-null  object 
 2   summary                   22014 non-null  object 
 3   installs                  22015 non-null  object 
 4   minInstalls               22015 non-null  float64
 5   score                     22015 non-null  float64
 6   ratings                   22015 non-null  float64
 7   reviews                   22015 non-null  float64
 8   histogram                 22015 non-null  object 
 9   price                     22015 non-null  float64
 10  free                      22015 non-null  int64  
 11  currency                  22015 non-null  object 
 12  sale                      22015 non-null  bool   
 13  offersIAP                 22015 non-null  bool   
 14  inAppP

In [3]:
# Filter to higher scoring apps to provide better recommendations for users
df = df[df['description_clean'].notna()]
df = df[df['score'] >= 3.8]

In [4]:
df_analysis = df[['title','description_clean']].copy()
df_analysis.reset_index(drop=True,inplace=True)

In [5]:
df_analysis

Unnamed: 0,title,description_clean
0,World War 2: Offline Strategy,command allies in 25 epic world war 2 locatio...
1,"All PDF - PDF Reader, PDF Viewer & PDF Converter",pdf reader for android pdfs converter free is ...
2,MSN Sports - Scores & Schedule,be in a league of your ownget real-time game u...
3,QRbot: QR & barcode reader,scan all kinds of qr codes and barcodes with t...
4,QR & Barcode Scanner,qr barcode scannerthis qr barcode scanner wi...
...,...,...
17136,Aksara Jawa - Nulis Aksara Jawa | Ketik & Konv...,aksara jawa adalah salah satu aset budaya indo...
17137,Mp3 Songs Download,the application provides search stream and dow...
17138,PDF Maker,main features of orangepalm s pdf maker app- s...
17139,DSLR Blur Photo,this app lets you blur parts of your photo whi...


In [6]:
stop = stopwords.words('english')
stop.extend(['free','new','get','hd','use','game','games','make','makes','play',
             'fun','features', 'need','live', 'also','using','best','us','app','apps', 'one', '2020','2021',
            'this','like','enjoy','thing','free'])

# # Extend for most common English adverbs

stop.extend(['up','so','out','just','now','how','then','more','also','here',
            'well','only','very','even','back','there','down','still','in',
            'as','to','when','never','really','most','on','why','about','over',
            'again','where','right','off','always','today','all','far','long',
            'away','yet','often','ever','however','almost','later','much',
            'once','least','ago','together','around','already','enough','both',
            'maybe','actually','probably','home','of course','perhaps','little',
            'else','sometimes','finally','less','better','early','especially',
            'either','quite','simply','nearly','soon','certainly','quickly',
            'no','recently','before','usually','thus','exactly','hard',
            'particularly','forward','ok','okay','clearly','indeed',
            'rather','that','tonight','close','suddenly','best','instead',
            'ahead','fast','alone','eventually','directly'])

# # Extend for most common irregular verbs (except pay,lose,send,buy,spend)

# stop.extend(['say','make','go','take','come','see','know','get','got','give',
#             'find','think','tell','show','leave','feel','put','bring',
#             'begin','keep','hold','stand','hear','let','mean','set',
#             'sit','lie','lead','fall','break','rise','drive','choose'])

# ## Extend for prepositions
stop.extend(['without','among'])

## Stemmed

In [7]:
df['tokens'] = df.description_clean.apply(nltk.word_tokenize)

In [8]:
stemmer = PorterStemmer()

df['stemmed'] = df['tokens'].apply(lambda x: [stemmer.stem(y) for y in x])

In [9]:
df.tokens = df.tokens.apply(lambda x: ' '.join(x))

In [10]:
df.stemmed = df.stemmed.apply(lambda x: ' '.join(x))

In [11]:
df_analysis = df[['title','stemmed']].copy()
df_analysis.reset_index(drop=True,inplace=True)

In [12]:
df_analysis

Unnamed: 0,title,stemmed
0,World War 2: Offline Strategy,command alli in 25 epic world war 2 locat we h...
1,"All PDF - PDF Reader, PDF Viewer & PDF Converter",pdf reader for android pdf convert free is one...
2,MSN Sports - Scores & Schedule,be in a leagu of your ownget real-tim game upd...
3,QRbot: QR & barcode reader,scan all kind of qr code and barcod with the q...
4,QR & Barcode Scanner,qr barcod scannerthi qr barcod scanner will le...
...,...,...
17136,Aksara Jawa - Nulis Aksara Jawa | Ketik & Konv...,aksara jawa adalah salah satu aset budaya indo...
17137,Mp3 Songs Download,the applic provid search stream and download a...
17138,PDF Maker,main featur of orangepalm s pdf maker app- sel...
17139,DSLR Blur Photo,thi app let you blur part of your photo which ...


In [13]:
vectorizer = CountVectorizer(stop_words = stop)

doc_word = vectorizer.fit_transform(df_analysis.stemmed)



In [14]:
nmf_model = NMF(30)
doc_topic = nmf_model.fit_transform(doc_word)
topic_word = nmf_model.components_

In [15]:
words = vectorizer.get_feature_names()
t = nmf_model.components_.argsort(axis=1)[:,-1:-10:-1]
topic_words = [[words[e] for e in l] for l in t]
topic_words

[['card',
  'time',
  'workout',
  'account',
  'help',
  'day',
  'ani',
  'creat',
  'featur'],
 ['car', 'drive', 'race', 'park', 'drift', 'stunt', 'simul', 'extrem', 'real'],
 ['photo',
  'frame',
  'editor',
  'effect',
  'pictur',
  'imag',
  'background',
  'collag',
  'camera'],
 ['robot',
  'transform',
  'fli',
  'war',
  'fight',
  'battl',
  'car',
  'futurist',
  'citi'],
 ['video',
  'maker',
  'chat',
  'creat',
  'audio',
  'player',
  'edit',
  'download',
  'effect'],
 ['keyboard',
  'theme',
  'emoji',
  'type',
  'font',
  'languag',
  'cute',
  'arab',
  'note'],
 ['color',
  'book',
  'page',
  'number',
  'girl',
  'paint',
  'glitter',
  'beauti',
  'kid'],
 ['truck',
  'drive',
  'transport',
  'simul',
  'cargo',
  'driver',
  'offroad',
  'road',
  'armi'],
 ['shoot',
  'gun',
  'fp',
  'sniper',
  'shooter',
  'mission',
  'commando',
  'terrorist',
  'armi'],
 ['bike',
  'race',
  'stunt',
  'ramp',
  'track',
  'imposs',
  'drive',
  'moto',
  'extrem'],
 [

In [16]:
df_analysis['topic'] = doc_topic.argmax(axis=1)

In [17]:
df_analysis[df_analysis['topic'] == 20].sample(60)

Unnamed: 0,title,stemmed,topic
2855,Fitness Gym Bodybuilding Pump,are you fond of fit and bodybuildingdo you dre...,20
3805,Harvest Town,harvest town is a simul mobil game with pixel ...,20
5131,Gladihoppers - Gladiator Battle Simulator!,hop into the sandal of a gladiat and fight for...,20
7122,Carrier Landings,land on an aircraft carrier is one of the most...,20
7938,Devil Twins: VIP,vip mode effect1 soul orb obtain 20 forever2 d...,20
1693,Ninja Assassin Hero - Gangster Fighting Games ...,get readi to play the most epic ninja battlegr...,20
2141,Toys And Me - Bubble Pop,toy and me - bubbl popaim fire pop play the br...,20
2778,Piggy GO - Clash of Coin,welcom to piggi go roll your dice and travel a...,20
12819,Another Stickman Platform 3: The Ninja Simulator,parkour stickman game run and climb like a nin...,20
15786,Hanger World - Rope Swing,swing around in the best stickman rope swing g...,20


In [18]:
doc_topic_nmf = pd.DataFrame(doc_topic.round(5),
                             index = df.index)
doc_topic_nmf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,0.00000,0.00000,0.00000,0.04051,0.00000,0.00000,0.00000,0.00000,0.03485,0.00000,...,0.11463,0.00281,0.02935,0.00000,0.00101,0.04113,0.00166,0.00000,0.00000,0.00405
2,0.00387,0.00000,0.01341,0.00000,0.00000,0.00000,0.09414,0.00000,0.00013,0.00000,...,0.00000,0.00000,0.00000,0.04494,0.00000,0.21796,0.06242,0.00000,0.00000,0.00000
3,0.05507,0.00683,0.00000,0.00000,0.04199,0.00000,0.00000,0.00000,0.00302,0.00402,...,0.05325,0.00000,0.00357,0.01553,0.00000,0.00000,0.00039,0.00376,0.00000,0.01198
4,0.07705,0.00000,0.00096,0.00000,0.00000,0.00000,0.01249,0.00924,0.00000,0.00000,...,0.00000,0.00000,0.26577,0.00499,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
5,0.04367,0.00000,0.00281,0.00000,0.00000,0.00554,0.05128,0.00000,0.00087,0.00087,...,0.01185,0.01157,0.04316,0.00907,0.00013,0.00000,0.00750,0.00000,0.00000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22006,0.00000,0.00000,0.00000,0.00000,0.00215,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00031,0.00749,0.00000,0.00000,0.00000,0.00471,0.00000,0.02802
22007,0.04270,0.00117,0.00000,0.00000,0.00000,0.01371,0.00000,0.00114,0.00000,0.04046,...,0.00000,0.00000,0.03345,0.14533,0.04717,0.11161,0.00000,0.00000,0.00000,0.04237
22011,0.00000,0.00000,0.30853,0.00000,0.02869,0.00000,0.00000,0.00000,0.00129,0.00000,...,0.00000,0.00000,0.00000,0.04715,0.06733,0.08847,0.00638,0.00000,0.01889,0.00000
22012,0.01362,0.00000,0.10442,0.00031,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.04841,0.00077,0.04383,0.00061,0.00000,0.00000,0.00000


### Recommender

In [20]:
def recommend_app(search):
    app = [search]
    tt = nmf_model.transform(vectorizer.transform(app))
    rec_array = pairwise_distances(tt.reshape(1,-1),doc_topic,metric='cosine').argsort()
    recs = list(rec_array[0][0:20])
    name = df_analysis.loc[recs]
    
    return name[0:5]

In [21]:
recommend_app('airline flights')

Unnamed: 0,title,stemmed,topic
298,"Almosafer: Hotels, Flights and Holidays",whether your plan a famili holiday busi trip g...,0
11941,Tiket Kereta Api - Tiket KAI,an applic from indonesia for indonesia tiket k...,0
3155,Cebu Pacific,cebu pacif air just gocebu pacif is the larges...,0
2330,Simple Loan Calculator,the simpl calcul for annuiti differenti and fi...,0
5418,Cashback service Megabonus,get up to 40 of the price of onlin purchas in ...,0


In [22]:
recommend_app('play card games')

Unnamed: 0,title,stemmed,topic
15769,AEON THAI MOBILE,main fuction card transact and avail limit to ...,0
17085,"WEWIN (Weme, beme) Vietnam's national card game",vietnames card game all vietnames tradit card ...,0
2840,Durak,implement of durak card game whi is it better1...,0
10419,Cards - Mobile Wallet,some of our user may experi connect issu we ar...,0
7299,FreeCell,freecel is a classic and popular card gamego o...,0


In [23]:
recommend_app('learn to speak hindi')

Unnamed: 0,title,stemmed,topic
14864,My Korean Teacher : Quiz,my korean teacher is a game to learn korean th...,29
7603,"Learn reading, speaking English for Kids - BiBo",bibo speak english is an educ applic design fr...,29
4893,German Language Learning - Busuu,were busuu were an app that make learn a langu...,29
9442,Learn French Fast: French Course,so you want to learn french in no time you nee...,29
7599,Studycat: Learn English for Kids,learn english is fun with studycatour educ lan...,29


In [24]:
recommend_app('algebra lessons')

Unnamed: 0,title,stemmed,topic
7746,Preschool All-In-One,preschool all-in-on is great for kid 2 - 5 yea...,14
8436,Achikaps,achikap is a simpl econom strategi features- e...,14
143,ABC KIDS,free educ app to learn english letter number t...,14
13333,MentalUP - Learning Games & Brain Games,kid want to play mentalup educ learn game brin...,14
15619,Learn Alphabet for Kids with Marbel,marbel alphabet - an educ applic that help kid...,14


In [25]:
recommend_app('convert pdf quickly')

Unnamed: 0,title,stemmed,topic
13879,Doc to PDF Converter (xls ppt word png jpg csv...,ani one can easili convert hisher document fil...,17
13372,PDF File Reader,docdocx viewer integratedbest pdf reader for 2...,17
10370,Explorer+ File Manager,explor a practic no-nonsens file manager- brow...,17
11982,File Explorer,a simpl file explor with basic function for th...,17
10652,"WPS PDF - Free For PDF Scan, Read, Edit, Convert",featur of wp pdf readerif you love read pdf an...,17


In [26]:
recommend_app('first person shooter')

Unnamed: 0,title,stemmed,topic
8476,Archery Ace,an incred new archeri game is wait for you arc...,8
5203,Battlefield Ops: 3D Free FPS Shooter & Strike ...,best 3d fp with never-end fire and shoot actio...,8
2055,"Idle Shooting Target: Best Gun Sound, Sniper F...",are you readi to be a shooter come and experi ...,8
12882,Counter Commando Strike - New Action Strike Game,counter commando strike - new action strike ga...,8
9197,Gentle Sniper,everyonebeliev themselv they are the best snip...,8


In [27]:
recommend_app('pet dog')

Unnamed: 0,title,stemmed,topic
15531,Kids Policeman games: Hippo Detective,a crimin world is in action again it mean that...,19
14532,Police Dog Airport Crime Chase : Dog Games,the secur of the airport is in your hand now t...,19
11385,Police Dog Attack Prison Break,rivalri between prison and polic game take a n...,19
4976,Flying Horse Police Chase : US Police Horse Games,fli polic hors chase crimin with fli hors ride...,19
15050,Police Scanner X,with polic scanner x you can listen to policef...,19


In [28]:
recommend_app('meet other singles')

Unnamed: 0,title,stemmed,topic
13295,Online Girls Chat Meet,top featur of onlin girl free download app fre...,15
1745,"Hello - Talk, Chat & Meet",alreadi 500000000 call and 250000000 match and...,15
16666,Stranger chat: meet new people,fed up with your current random chat unknown c...,15
944,Bolo - Adult Chat bigo hot girl video call app hd,bolo is a pure stranger video chat secret onli...,15
2431,"Mashi - Free Voice Chat Rooms , Party in the Room",mashi the no1 local free voic chat room app in...,15


In [29]:
recommend_app('space simulator')

Unnamed: 0,title,stemmed,topic
6503,CastleStorm - Free to Siege,welcom to the world of castlestorm - free to s...,20
16831,Illuminati vs. Memes MLG,get readi 2 gr8 illuminati battl against memes...,20
8235,HEIR OF LIGHT,a dark fantasi collect rpgdark ha overtaken th...,20
371,MapleStory M - Open World MMORPG,maplestori m the highli anticip follow-up to m...,20
14136,Freecell King,features- stage mode 450 stages- classic mode-...,11


In [30]:
recommend_app('mobile banking')

Unnamed: 0,title,stemmed,topic
5803,LogDog - Mobile Security 2021,trust by million featur on pc magazin techcrun...,0
4642,Fastlink,4g 1- 2- 3- 4- 5- 6- 4g lte 1- 2- 3- 4- 5- 6- ...,0
5626,Paychex Flex,the paychex flex app can be use by previous re...,0
12096,Flip: Transfer Antar Bank Tanpa Biaya Admin,dengan flip transfer uang ke beda bank bisa gr...,22
10521,Jira Cloud by Atlassian,organ everyth track task fix bug answer reques...,0


In [31]:
recommend_app('war strategy')

Unnamed: 0,title,stemmed,topic
130,Cyber Dead: Metal Zombie Shooting Super Squad,the dead aris and mix with cyber you mean unde...,20
7080,Cat Shooting War: Offline Gunner TD Battles,protect your own felin kingdom from evil fight...,20
12143,Tank Attack Blitz: Panzer War Machines,welcom to tank attack blitz panzer war game th...,20
5138,Dead Invaders: FPS Shooting Game & Modern War 3D,becom last line of defens and save mankind fro...,20
5709,Last Human Life on Earth,in 2035 the world saw a plagu that kill most o...,20


In [32]:
recommend_app('make new friends')

Unnamed: 0,title,stemmed,topic
9162,CatFish,hey what up you look like you could use a litt...,20
3383,Haya - Group Voice Chat App,haya fun voic social chat togeth gather friend...,26
2003,Quidd: Digital Collectibles,rememb trade card theyr back in digit formbuil...,26
4608,Wink - find & make new friends,wink unlimit friend wink is the best place to ...,26
2432,Sango - Free Live Group Voice Chat Rooms,sango is an interact live group voic chat and ...,26


In [33]:
recommend_app('hindi translation')

Unnamed: 0,title,stemmed,topic
12360,English - Hindi Translator,learn hindi or visit a hindi speak countri eas...,29
14038,Offline Urdu Lughat - Urdu to Urdu Dictionary,offlin urdu lughat is the first total offlin a...,29
4686,dict.cc dictionary,dictionari for 51 languag combin usabl without...,29
16274,Mahir Bahasa Inggris untuk Pemula,aplikasi mahir bahasa inggri sehari-hari adala...,29
14185,Bangla Keyboard - English to Bangla Typing,infin app studio bring you the new best free b...,29


In [34]:
recommend_app('listen to music')

Unnamed: 0,title,stemmed,topic
2320,Lowriders Comeback 2 : Russia,the biggest music game about lowridersfeatures...,13
16172,Equalizer Pro & Bass Booster,equal pro bass booster with easy-to-us home wi...,13
13338,Free Music for SoundCloud,million of trendinghottop music all free downl...,13
14245,Music Player For Samsung,music player for samsung galaxi - s11 music pl...,13
3535,PowerAudio Pro Music Player,poweraudio music player is the mostpow music p...,13


In [35]:
recommend_app('piano lessons')

Unnamed: 0,title,stemmed,topic
12050,Music Kids - Songs & Music Instruments,music kid is a great fun music box creat espec...,13
7145,Nature sounds relax & sleep,you want to relax sleep better and increas con...,13
7204,Lullabies Relax & Sleep Baby,want your babi to fall asleep quickli just cho...,13
7102,Music box to sleep,your babi can not sleep cri a lot whi do not y...,13
7955,Baby Zoo Piano with Music for Toddlers and Kids,babi zoo piano is an excel entertain and educ ...,14


In [36]:
recommend_app('secure vpn app')

Unnamed: 0,title,stemmed,topic
11348,Free WiFi Internet - Data Usage Monitor,app offer updat premium featur for free who us...,22
5766,WiFi Speed Test - WiFi Signal Strength Meter,wifi speed test - wifi signal strength meter i...,22
10828,WiFi Router Scanner - Who is on my WiFi?,wifi router scanner - who is on my wifi is pow...,22
10210,WiFi Manager - WiFi Network Analyzer & Speed Test,wifi manag - wifi network analyz boost wifi sp...,22
15037,TownWiFi by GMO | WiFi Everywhere,you can connect and authent to over 2m public ...,22


In [37]:
df_short = df[['title','description','summary','url','screenshots','score','free','genre','icon','year','contentRating']].copy()

In [52]:
df_short.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17141 entries, 0 to 22014
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   title          17141 non-null  object 
 1   description    17141 non-null  object 
 2   summary        17141 non-null  object 
 3   url            17141 non-null  object 
 4   screenshots    17141 non-null  object 
 5   score          17141 non-null  float64
 6   free           17141 non-null  int64  
 7   genre          17141 non-null  object 
 8   icon           17141 non-null  object 
 9   year           17141 non-null  int64  
 10  contentRating  17141 non-null  object 
dtypes: float64(1), int64(2), object(8)
memory usage: 1.6+ MB


In [38]:
df_merged = pd.concat([df_short,doc_topic_nmf],axis=1)

In [39]:
df_merged

Unnamed: 0,title,description,summary,url,screenshots,score,free,genre,icon,year,...,20,21,22,23,24,25,26,27,28,29
0,World War 2: Offline Strategy,Command allies in 25 epic World War 2 locatio...,Historic events of World War 2,https://play.google.com/store/apps/details?id=...,['https://play-lh.googleusercontent.com/IBX59j...,4.147644,1,Strategy,https://play-lh.googleusercontent.com/yP4iAlvX...,2018,...,0.11463,0.00281,0.02935,0.00000,0.00101,0.04113,0.00166,0.00000,0.00000,0.00405
2,"All PDF - PDF Reader, PDF Viewer & PDF Converter","PDF Reader for android, PDFs converter free is...","PDF Reader, PDF Reader 2021 Converter. Reduce ...",https://play.google.com/store/apps/details?id=...,['https://play-lh.googleusercontent.com/gAJf_N...,4.376968,1,Books & Reference,https://play-lh.googleusercontent.com/OZCfUPN4...,2018,...,0.00000,0.00000,0.00000,0.04494,0.00000,0.21796,0.06242,0.00000,0.00000,0.00000
3,MSN Sports - Scores & Schedule,Be in a league of your own\r\n\r\nGet real-tim...,"Be in a league of your own. Get scores, news, ...",https://play.google.com/store/apps/details?id=...,['https://play-lh.googleusercontent.com/a2wvAV...,4.234083,1,Sports,https://play-lh.googleusercontent.com/ywPbVWLi...,2014,...,0.05325,0.00000,0.00357,0.01553,0.00000,0.00000,0.00039,0.00376,0.00000,0.01198
4,QRbot: QR & barcode reader,Scan all kinds of QR codes and barcodes with t...,Functional QR scanner and barcode reader that ...,https://play.google.com/store/apps/details?id=...,['https://play-lh.googleusercontent.com/ZtF-o_...,4.548066,1,Tools,https://play-lh.googleusercontent.com/zg-T79Tw...,2015,...,0.00000,0.00000,0.26577,0.00499,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
5,QR & Barcode Scanner,QR & BARCODE SCANNER\r\n\r\nThis QR & Barcode ...,Scanner on your fingertip,https://play.google.com/store/apps/details?id=...,['https://play-lh.googleusercontent.com/MNEJK4...,4.183137,1,Productivity,https://play-lh.googleusercontent.com/g1h9CNtS...,2019,...,0.01185,0.01157,0.04316,0.00907,0.00013,0.00000,0.00750,0.00000,0.00000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22006,Aksara Jawa - Nulis Aksara Jawa | Ketik & Konv...,Aksara Jawa adalah salah satu aset budaya Indo...,Learn Javanese Script on HP: Convert writing t...,https://play.google.com/store/apps/details?id=...,['https://play-lh.googleusercontent.com/q5dFKm...,4.383142,1,Education,https://play-lh.googleusercontent.com/G6ws-Fkf...,2015,...,0.00000,0.00000,0.00031,0.00749,0.00000,0.00000,0.00000,0.00471,0.00000,0.02802
22007,Mp3 Songs Download,"The application provides search, stream, and d...",Make a moment Happy,https://play.google.com/store/apps/details?id=...,['https://play-lh.googleusercontent.com/1h95pS...,4.216473,1,Music & Audio,https://play-lh.googleusercontent.com/wiHyHsXa...,2017,...,0.00000,0.00000,0.03345,0.14533,0.04717,0.11161,0.00000,0.00000,0.00000,0.04237
22011,PDF Maker,Main features of OrangePalm 's PDF Maker app.\...,"Create PDF files on your device for images, do...",https://play.google.com/store/apps/details?id=...,['https://play-lh.googleusercontent.com/WKwVpV...,4.259011,1,Productivity,https://play-lh.googleusercontent.com/nKYYm0as...,2020,...,0.00000,0.00000,0.00000,0.04715,0.06733,0.08847,0.00638,0.00000,0.01889,0.00000
22012,DSLR Blur Photo,This App lets you blur parts of your photo whi...,Now blur your picture's background just by tou...,https://play.google.com/store/apps/details?id=...,['https://play-lh.googleusercontent.com/RA5nfz...,3.829474,1,Photography,https://play-lh.googleusercontent.com/CWS2z7zD...,2016,...,0.00000,0.00000,0.00000,0.04841,0.00077,0.04383,0.00061,0.00000,0.00000,0.00000


In [40]:
df_merged.title.isnull().values.any()

False

In [66]:
df_merged['screenshot_list'] = df_merged.screenshots.apply(lambda x: x[1:-1].split(','))
df_merged['screenshot_list'] = df_merged['screenshot_list'][0:4]

In [71]:
df_merged

Unnamed: 0,title,description,summary,url,screenshots,score,free,genre,icon,year,...,24,25,26,27,28,29,screenshot_short,screenshot_len,screenshot_list,len
0,World War 2: Offline Strategy,Command allies in 25 epic World War 2 locatio...,Historic events of World War 2,https://play.google.com/store/apps/details?id=...,['https://play-lh.googleusercontent.com/IBX59j...,4.147644,1,Strategy,https://play-lh.googleusercontent.com/yP4iAlvX...,2018,...,0.00101,0.04113,0.00166,0.00000,0.00000,0.00405,['https://play-lh.googleusercontent.com/IBX59j...,436,['https://play-lh.googleusercontent.com/IBX59j...,436
2,"All PDF - PDF Reader, PDF Viewer & PDF Converter","PDF Reader for android, PDFs converter free is...","PDF Reader, PDF Reader 2021 Converter. Reduce ...",https://play.google.com/store/apps/details?id=...,['https://play-lh.googleusercontent.com/gAJf_N...,4.376968,1,Books & Reference,https://play-lh.googleusercontent.com/OZCfUPN4...,2018,...,0.00000,0.21796,0.06242,0.00000,0.00000,0.00000,,546,['https://play-lh.googleusercontent.com/gAJf_N...,546
3,MSN Sports - Scores & Schedule,Be in a league of your own\r\n\r\nGet real-tim...,"Be in a league of your own. Get scores, news, ...",https://play.google.com/store/apps/details?id=...,['https://play-lh.googleusercontent.com/a2wvAV...,4.234083,1,Sports,https://play-lh.googleusercontent.com/ywPbVWLi...,2014,...,0.00000,0.00000,0.00039,0.00376,0.00000,0.01198,,1743,['https://play-lh.googleusercontent.com/a2wvAV...,1743
4,QRbot: QR & barcode reader,Scan all kinds of QR codes and barcodes with t...,Functional QR scanner and barcode reader that ...,https://play.google.com/store/apps/details?id=...,['https://play-lh.googleusercontent.com/ZtF-o_...,4.548066,1,Tools,https://play-lh.googleusercontent.com/zg-T79Tw...,2015,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,,1307,['https://play-lh.googleusercontent.com/ZtF-o_...,1307
5,QR & Barcode Scanner,QR & BARCODE SCANNER\r\n\r\nThis QR & Barcode ...,Scanner on your fingertip,https://play.google.com/store/apps/details?id=...,['https://play-lh.googleusercontent.com/MNEJK4...,4.183137,1,Productivity,https://play-lh.googleusercontent.com/g1h9CNtS...,2019,...,0.00013,0.00000,0.00750,0.00000,0.00000,0.00000,,437,,437
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22006,Aksara Jawa - Nulis Aksara Jawa | Ketik & Konv...,Aksara Jawa adalah salah satu aset budaya Indo...,Learn Javanese Script on HP: Convert writing t...,https://play.google.com/store/apps/details?id=...,['https://play-lh.googleusercontent.com/q5dFKm...,4.383142,1,Education,https://play-lh.googleusercontent.com/G6ws-Fkf...,2015,...,0.00000,0.00000,0.00000,0.00471,0.00000,0.02802,,655,,655
22007,Mp3 Songs Download,"The application provides search, stream, and d...",Make a moment Happy,https://play.google.com/store/apps/details?id=...,['https://play-lh.googleusercontent.com/1h95pS...,4.216473,1,Music & Audio,https://play-lh.googleusercontent.com/wiHyHsXa...,2017,...,0.04717,0.11161,0.00000,0.00000,0.00000,0.04237,,871,,871
22011,PDF Maker,Main features of OrangePalm 's PDF Maker app.\...,"Create PDF files on your device for images, do...",https://play.google.com/store/apps/details?id=...,['https://play-lh.googleusercontent.com/WKwVpV...,4.259011,1,Productivity,https://play-lh.googleusercontent.com/nKYYm0as...,2020,...,0.06733,0.08847,0.00638,0.00000,0.01889,0.00000,,435,,435
22012,DSLR Blur Photo,This App lets you blur parts of your photo whi...,Now blur your picture's background just by tou...,https://play.google.com/store/apps/details?id=...,['https://play-lh.googleusercontent.com/RA5nfz...,3.829474,1,Photography,https://play-lh.googleusercontent.com/CWS2z7zD...,2016,...,0.00077,0.04383,0.00061,0.00000,0.00000,0.00000,,438,,438


In [41]:
with open('df_apps_topics.pkl', 'wb') as f:
    pickle.dump(df_merged, f)

In [56]:
with open('nmf_model.pkl', 'wb') as f:
    pickle.dump(nmf_model, f)

In [57]:
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

In [58]:
with open('doc_topic_array.pkl', 'wb') as f:
    pickle.dump(doc_topic, f)