In [3]:
import json
import numpy as np
import pandas as pd

from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import stopwords
stop_words_en = list(stopwords.words("english"))

# Load the Dataset

In [2]:
sofia_airbnb = pd.read_csv("../data/sofia_airbnb_reviews.csv")

***

# Term-Frequency x Inverse-Doc-Freq

In [11]:
for loc in sofia_airbnb.values:
    tf_idf = TfidfVectorizer(use_idf=True, stop_words=stop_words_en)
    X = tf_idf.fit_transform(json.loads(loc[2]))
    mean = np.mean(X.todense(), axis=0)
    tf_idf_scores = np.multiply(mean, tf_idf.idf_.reshape(mean.shape))
    
    words = np.array(tf_idf.get_feature_names()).reshape(mean.shape)
    zipped = np.hstack((tf_idf_scores.T, words.T)).tolist()
    print(np.array(sorted(zipped, key=lambda x: x[0], reverse=True)))
    
    print('='*100)
    print('='*100)

[['0.18019997726693912' 'great']
 ['0.16693974105045134' 'nice']
 ['0.15807689563968194' 'place']
 ['0.14398912391173857' 'perfect']
 ['0.12863950216410877' 'location']
 ['0.1261239904326111' 'host']
 ['0.1152292651010519' 'stay']
 ['0.11262514913718652' 'sofia']
 ['0.11010702834465844' 'everything']
 ['0.10893547754776185' 'good']
 ['0.10870970098990194' 'apartment']
 ['0.10815900132045818' 'super']
 ['0.10773459928060357' 'really']
 ['0.10731512460247292' 'tony']
 ['0.10519088764076279' 'people']
 ['0.10012034474439055' 'close']
 ['0.09681585511907705' 'central']
 ['0.09510642700914754' 'us']
 ['0.09461016839242467' 'cozy']
 ['0.09319772342471115' 'city']
 ['0.0925969421328269' 'vera']
 ['0.09255003796145016' 'room']
 ['0.09252115737668712' 'clean']
 ['0.09036957437282857' 'pleased']
 ['0.08733124667117649' 'heart']
 ['0.08328505289596816' 'recommended']
 ['0.07826881205793697' 'la']
 ['0.07522871607100137' 'definitely']
 ['0.07477584824149587' 'comfortable']
 ['0.07274003109690853' 

[['0.150093896020655' 'excellent']
 ['0.1420544742765579' 'highly']
 ['0.1420544742765579' 'recommended']
 ['0.13413999490486703' 'apartment']
 ['0.1327825581913776' 'nice']
 ['0.12986727861718694' 'city']
 ['0.1296334191849044' 'perfect']
 ['0.12899329816305594' 'clean']
 ['0.12830078316294222' 'elieta']
 ['0.1235303996541538' 'host']
 ['0.12344815049966082' 'us']
 ['0.1230602821053771' 'great']
 ['0.11962378407681462' 'thanks']
 ['0.11952963495182573' 'stay']
 ['0.11910156353581836' 'everything']
 ['0.11700082289431048' 'would']
 ['0.11647385866975969' 'heart']
 ['0.11410260063437969' 'helpful']
 ['0.10890534038913784' 'located']
 ['0.10824647907539629' 'und']
 ['0.10594584255983694' 'close']
 ['0.10590169902995467' 'location']
 ['0.10345932790797491' 'vest']
 ['0.10208288886175874' 'sofia']
 ['0.10192381313277157' 'center']
 ['0.10129011182580103' 'communication']
 ['0.09844927240178956' 'definitely']
 ['0.09730915080139069' 'needed']
 ['0.09606272761990754' 'return']
 ['0.093564261

[['0.1596052411821743' 'great']
 ['0.15654831175206596' 'place']
 ['0.1391021661766225' 'stay']
 ['0.12955695520313315' 'amazing']
 ['0.12574471111931573' 'apartment']
 ['0.11707353600058802' 'really']
 ['0.1155448156481647' 'good']
 ['0.1109265840797459' 'sofia']
 ['0.10969006910071108' 'us']
 ['0.10472758925870844' 'helpful']
 ['0.10026204319036233' 'fast']
 ['0.09689367823234812' 'friendly']
 ['0.09213930106251347' 'city']
 ['0.08971085605549181' 'perfect']
 ['0.08899437520461667' 'clean']
 ['0.08892863771868678' 'denitsa']
 ['0.08617087527741348' 'hosts']
 ['0.08310197581464841' 'well']
 ['0.0808784783146256' 'recommend']
 ['0.07967894137866842' 'host']
 ['0.0794925755789163' 'modern']
 ['0.0778685831754456' 'appartment']
 ['0.07722855528760937' 'time']
 ['0.07719920005375157' 'spacious']
 ['0.07647836600174399' 'nice']
 ['0.07430090724386092' 'would']
 ['0.07425815573093945' 'location']
 ['0.07405525501468205' 'wish']
 ['0.07387320935630949' 'could']
 ['0.07106845720279847' 'comfo

[['0.13792545025594297' 'great']
 ['0.12126854228426354' 'apartment']
 ['0.09981966463600515' 'location']
 ...
 ['0.0017079161341980863' 'zonnige']
 ['0.0017079161341980863' 'zou']
 ['0.0017079161341980863' 'zus']]
[['0.13437900618445467' 'great']
 ['0.12738522941709518' 'nice']
 ['0.1270801386387194' 'apartment']
 ...
 ['0.004178287193102173' 'tous']
 ['0.004178287193102173' 'volets']
 ['0.004178287193102173' 'être']]
[['0.14188296122661584' 'place']
 ['0.13843921474486587' 'great']
 ['0.12608844441619083' 'nice']
 ...
 ['0.007916417011204706' 'этаж']
 ['0.007916417011204706' 'эту']
 ['0.007916417011204706' 'языки']]
[['0.11243085755065167' 'us']
 ['0.11099474005738079' 'great']
 ['0.09201038555872149' 'nice']
 ...
 ['0.0015375923488958742' 'sud']
 ['0.0015375923488958742' 'talks']
 ['0.0015375923488958742' 'typique']]
[['0.1321773859332787' 'place']
 ['0.11980378901935775' 'nice']
 ['0.11884736301020701' 'perfect']
 ...
 ['0.0016868669669111412' 'usar']
 ['0.0016868669669111412' 'vec

[['0.18721873275001344' 'great']
 ['0.1834286730681121' 'location']
 ['0.1510185630203723' 'everything']
 ['0.14841298509572015' 'place']
 ['0.14390593387945744' 'clean']
 ['0.1360071195237854' 'apartment']
 ['0.13542408139266954' 'communication']
 ['0.13388627544599943' 'stay']
 ['0.12527937024016134' 'check']
 ['0.1199266236472414' 'good']
 ['0.11464200175499475' 'sofia']
 ['0.11339489678378144' 'super']
 ['0.10939335961034041' 'host']
 ['0.10834709142899675' 'ottimo']
 ['0.10823020704936565' 'described']
 ['0.1050890090430213' 'easy']
 ['0.10322987344341683' 'soggiorno']
 ['0.10322987344341683' 'top']
 ['0.10272407803441994' 'fast']
 ['0.09827969238523075' 'nice']
 ['0.09722559978806539' 'comfortable']
 ['0.09563323293238582' 'close']
 ['0.09403914772528894' 'recommend']
 ['0.09399450443376936' 'min']
 ['0.09291681823239514' 'well']
 ['0.08877762161525918' 'ok']
 ['0.0866614762030352' 'la']
 ['0.08562748585386647' 'cosy']
 ['0.08493193938662554' 'highly']
 ['0.08188903792158386' 'st

[['0.14383094372178212' 'great']
 ['0.11741500424107375' 'place']
 ['0.1114898167983631' 'apartment']
 ...
 ['0.0017293912778557732' 'zwei']
 ['0.0017293912778557732' 'zwischen']
 ['0.0017293912778557732' 'zügig']]
[['0.13148538783581773' 'great']
 ['0.11140220802035049' 'place']
 ['0.11004683597020783' 'location']
 ...
 ['0.002038245282222179' 'washing']
 ['0.002038245282222179' 'wi']
 ['0.002038245282222179' 'wind']]
[['0.17144964013347216' 'nice']
 ['0.16812599647644194' 'stay']
 ['0.1613534272176063' 'place']
 ['0.14946642289053963' 'great']
 ['0.1407283208582269' 'good']
 ['0.1367969601156409' 'location']
 ['0.1335054898641595' 'recommend']
 ['0.13168972496671055' 'everything']
 ['0.1289627124468612' 'clean']
 ['0.12027938448062758' 'apartment']
 ['0.11503025399045162' 'perfect']
 ['0.10900782194373078' 'need']
 ['0.09803798965376263' 'super']
 ['0.09787368494895876' 'alles']
 ['0.09692979953015175' 'sofia']
 ['0.09446767655046992' 'short']
 ['0.09441043877719489' 'visiting']
 ['0

[['0.11414666279590688' 'apartment']
 ['0.11281871233351054' 'nice']
 ['0.10224232948235559' 'great']
 ...
 ['0.0026582631192936905' 'vicini']
 ['0.0026582631192936905' 'vista']
 ['0.0026582631192936905' 'voto']]
[['0.11176229769451289' 'great']
 ['0.10989635962463974' 'nice']
 ['0.10984543400958802' 'apartment']
 ...
 ['0.0017792644504494746' 'zijn']
 ['0.0017792644504494746' 'zorg']
 ['0.0017792644504494746' 'zou']]
[['0.16331621972399246' 'nice']
 ['0.14358427098956997' 'great']
 ['0.12704146962001722' 'place']
 ...
 ['0.006686069563979295' 'verlässlich']
 ['0.006686069563979295' 'wohngegend']
 ['0.006686069563979295' 'zahlreiche']]
[['0.12605993824146636' 'great']
 ['0.12437236838498876' 'place']
 ['0.11688867957765361' 'perfect']
 ...
 ['0.004491313824169887' 'visite']
 ['0.004491313824169887' 'écoute']
 ['0.004491313824169887' 'équipement']]
[['0.14430188991650314' 'nice']
 ['0.13766575881383683' 'place']
 ['0.12408092547586064' 'great']
 ...
 ['0.00442682387207828' 'vrouw']
 ['0

[['0.15888487829713127' 'great']
 ['0.13756000078945166' 'place']
 ['0.12793786134060983' 'nice']
 ...
 ['0.0025799622892180483' 'want']
 ['0.0025799622892180483' 'winkelcentrum']
 ['0.0025799622892180483' 'zelf']]
[['0.13093049144195174' 'great']
 ['0.12006542475122639' 'apartment']
 ['0.11682979617420988' 'nice']
 ...
 ['0.009587557391401403' 'transports']
 ['0.009587557391401403' 'veut']
 ['0.009587557391401403' 'visiter']]
[['0.12311550365663214' 'great']
 ['0.10914625543401464' 'apartment']
 ['0.10835378038054572' 'nice']
 ...
 ['0.0024600429699360243' 'wardrobe']
 ['0.0024600429699360243' 'washing']
 ['0.0024600429699360243' 'windows']]
[['0.20456345773754925' 'great']
 ['0.17000289362781157' 'nice']
 ['0.15429101950547863' 'place']
 ['0.13022180156872795' 'sofia']
 ['0.12842178856332478' 'apartment']
 ['0.12284855089810182' 'host']
 ['0.12267574130514208' 'super']
 ['0.1146600345628558' 'stay']
 ['0.11216477699796817' 'like']
 ['0.10877723163320771' 'really']
 ['0.10488797620014

[['0.15084104774721913' 'great']
 ['0.13905925838511327' 'place']
 ['0.13597439762927185' 'location']
 ...
 ['0.0037012870722661147' 'wandering']
 ['0.0037012870722661147' 'without']
 ['0.0037012870722661147' 'zona']]
[['0.11919215777093176' 'place']
 ['0.11447465703172462' 'great']
 ['0.11202677859870584' 'perfect']
 ...
 ['0.0024844464065129786' 'tuvimos']
 ['0.0024844464065129786' 'zapatillas']
 ['0.0024844464065129786' 'único']]
[['0.13289213183364143' 'apartment']
 ['0.13250083529854134' 'good']
 ['0.12537841554513068' 'great']
 ...
 ['0.004994353039319349' 'trascurato']
 ['0.004994353039319349' 'trova']
 ['0.004994353039319349' 'tutto']]
[['0.15030238579042557' 'great']
 ['0.13049173235415512' 'nice']
 ['0.129523434813551' 'place']
 ...
 ['0.0033708252581805733' 'trouve']
 ['0.0033708252581805733' 'work']
 ['0.0033708252581805733' 'écoute']]
[['0.1353527854466554' 'great']
 ['0.12629173140633118' 'apartment']
 ['0.12129769677871055' 'perfect']
 ...
 ['0.00429489294515558' 'tranqu

[['0.12841944504395092' 'great']
 ['0.11084374269608498' 'host']
 ['0.1089925964261933' 'apartment']
 ...
 ['0.0018293606621827172' 'wij']
 ['0.0018293606621827172' 'zo']
 ['0.0018293606621827172' 'zou']]
[['0.1290337514907954' 'great']
 ['0.12315558385609636' 'place']
 ['0.10563929940353961' 'apartment']
 ...
 ['0.0029423885708324384' 'world']
 ['0.0029423885708324384' 'worries']
 ['0.0029423885708324384' 'years']]
[['0.15537496671985365' 'great']
 ['0.1216425501705687' 'us']
 ['0.1215523802944966' 'place']
 ...
 ['0.005483625085483043' 'чтобы']
 ['0.005483625085483043' 'эти']
 ['0.005483625085483043' 'этой']]
[['0.18987563805024785' 'great']
 ['0.1109319485269104' 'sofia']
 ['0.10595993859731238' 'everything']
 ['0.1039477900710437' 'nice']
 ['0.10390539133880805' 'place']
 ['0.10332598081987523' 'stay']
 ['0.09936248337219467' 'miroslav']
 ['0.09865124665855234' 'clean']
 ['0.09638203155932078' 'recommend']
 ['0.09637388519694683' 'location']
 ['0.09320135638651011' 'miro']
 ['0.092

[['0.20823015101741352' 'wonderful']
 ['0.20470436389869537' 'nice']
 ['0.18217135273492507' 'close']
 ['0.16936848069794944' 'also']
 ['0.15845229472088196' 'place']
 ['0.15509858036486318' 'apartment']
 ['0.14637702314949022' 'good']
 ['0.1402800885913909' 'center']
 ['0.1367563556589665' 'location']
 ['0.11778347004356772' 'cheap']
 ['0.11778347004356772' 'really']
 ['0.11422288958370178' 'da']
 ['0.10303174802890358' 'la']
 ['0.09890004577717668' 'er']
 ['0.09588995967996047' 'better']
 ['0.09588995967996047' 'cockroaches']
 ['0.09588995967996047' 'fitted']
 ['0.09588995967996047' 'sheets']
 ['0.09588995967996047' 'would']
 ['0.08973008082901983' 'ambience']
 ['0.08973008082901983' 'excellent']
 ['0.08973008082901983' 'owners']
 ['0.08973008082901983' 'quite']
 ['0.08862677930216446' 'helpful']
 ['0.08816975759126032' 'host']
 ['0.08281885457823357' 'clean']
 ['0.08281885457823357' 'snejana']
 ['0.08143342645298488' 'bella']
 ['0.08143342645298488' 'città']
 ['0.08143342645298488' 

 ['0.014198904458264026' 'workout']]
[['0.1314599614847866' 'perfect']
 ['0.11499598851923636' 'place']
 ['0.11406213305650707' 'everything']
 ...
 ['0.005643502649204951' 'trovati']
 ['0.005643502649204951' 'tutto']
 ['0.005643502649204951' 'via']]
[['0.12392806189831615' 'great']
 ['0.11036514492669634' 'location']
 ['0.10936020667900252' 'perfect']
 ...
 ['0.0014194445604202384' 'zwei']
 ['0.0014194445604202384' 'öffentlichen']
 ['0.0014194445604202384' 'über']]
[['0.16502396501272887' 'good']
 ['0.1569418538417871' 'great']
 ['0.14894705918931941' 'nice']
 ['0.14552032850571972' 'perfect']
 ['0.14274407807110912' 'location']
 ['0.1401913918714607' 'place']
 ['0.13219746999827542' 'near']
 ['0.12270869449109256' 'stay']
 ['0.11956622620365243' 'apartment']
 ['0.117988163957624' 'would']
 ['0.11648484671613996' 'host']
 ['0.11228524617716742' 'city']
 ['0.10937573872077505' 'really']
 ['0.10649226758116297' 'center']
 ['0.10636787397196233' 'recommend']
 ['0.10130916566266027' 'simeo

[['0.13862993155758685' 'great']
 ['0.12535286540799306' 'place']
 ['0.11532096645826356' 'nice']
 ...
 ['0.005126755919190383' 'ático']
 ['0.005126755919190383' 'último']
 ['0.005126755919190383' 'única']]
[['0.12107297947143121' 'good']
 ['0.11933533651064061' 'great']
 ['0.11144451564982405' 'nice']
 ...
 ['0.002954418586474418' 'wenig']
 ['0.002954418586474418' 'zahlreichen']
 ['0.002954418586474418' 'zu']]
[['0.13608329597331828' 'great']
 ['0.11671988174273297' 'place']
 ['0.10387349738898981' 'nikolay']
 ...
 ['0.006330096161497889' 'tuvimos']
 ['0.006330096161497889' 'ubicación']
 ['0.006330096161497889' 'una']]
[['0.14609963604624723' 'place']
 ['0.14154005601876865' 'flat']
 ['0.13216764747039797' 'nice']
 ['0.12819761578465164' 'really']
 ['0.12777516746312817' 'great']
 ['0.12726203021835739' 'location']
 ['0.1259851355878214' 'wonderful']
 ['0.12269665759516085' 'host']
 ['0.12100797050883422' 'sofia']
 ['0.11551823540545188' 'stay']
 ['0.11375883286659429' 'clean']
 ['0.1

[['0.32584632682464676' '地理位置超级好']
 ['0.32584632682464676' '很不错']
 ['0.32584632682464676' '没有更好更方便的民宿了']
 ['0.2320158545730594' 'place']
 ['0.18974223040976407' 'us']
 ['0.15259019484783473' 'sofia']
 ['0.1373197346936457' 'apartment']
 ['0.1373197346936457' 'better']
 ['0.1373197346936457' 'clean']
 ['0.1373197346936457' 'everyone']
 ['0.1373197346936457' 'great']
 ['0.1373197346936457' 'heart']
 ['0.1373197346936457' 'hotel']
 ['0.1373197346936457' 'much']
 ['0.1373197346936457' 'newly']
 ['0.1373197346936457' 'recommend']
 ['0.1373197346936457' 'renovated']
 ['0.1373197346936457' 'situated']
 ['0.1373197346936457' 'value']
 ['0.1373197346936457' 'would']
 ['0.12649482027317605' 'also']
 ['0.12649482027317605' 'check']
 ['0.12649482027317605' 'enough']
 ['0.12649482027317605' 'experienced']
 ['0.12649482027317605' 'space']
 ['0.12649482027317605' 'time']
 ['0.06324741013658802' 'although']
 ['0.06324741013658802' 'anywhere']
 ['0.06324741013658802' 'architect']
 ['0.06324741013658802

[['0.1584314828409729' 'nice']
 ['0.12925998570778932' 'place']
 ['0.122255186397738' 'apartment']
 ...
 ['0.006416484592645375' '훨씬']
 ['0.006416484592645375' '흔들리며']
 ['0.006416484592645375' '흥겨운']]
[['0.1727901728850957' 'great']
 ['0.13156831752394854' 'room']
 ['0.12523005966624162' 'us']
 ['0.11711918960358914' 'host']
 ['0.11414410353985736' 'sofia']
 ['0.11353163494100192' 'perfect']
 ['0.112300963721488' 'place']
 ['0.10713579009987975' 'krasen']
 ['0.10488731571671112' 'time']
 ['0.10454294482036591' 'best']
 ['0.10433739291146006' 'nice']
 ['0.10142369066986436' 'like']
 ['0.10117204668753042' 'home']
 ['0.09958275075041577' 'private']
 ['0.09854719132926776' 'apartment']
 ['0.08852586531353913' 'view']
 ['0.08678665192412398' 'stay']
 ['0.08636188662121191' 'fantastic']
 ['0.08209062703264773' 'absolutely']
 ['0.08209062703264773' 'days']
 ['0.08065315765953036' 'available']
 ['0.07956655230827964' 'would']
 ['0.07843631361550778' 'take']
 ['0.077550528857694' 'airport']
 [

[['0.14840464435001618' 'great']
 ['0.1346547047279282' 'place']
 ['0.12471998915856394' 'nice']
 ...
 ['0.003765366320438392' 'vehicle']
 ['0.003765366320438392' 'verify']
 ['0.003765366320438392' 'über']]
[['0.13434464420452652' 'great']
 ['0.11197006933211058' 'place']
 ['0.10742675742023505' 'apartment']
 ...
 ['0.0036082289557042126' 'чистые']
 ['0.0036082289557042126' 'эти']
 ['0.0036082289557042126' 'этом']]
[['0.1656412856695751' 'place']
 ['0.14790622771573034' 'great']
 ['0.1256624647082962' 'stay']
 ...
 ['0.004831245615587689' 'unos']
 ['0.004831245615587689' 'usarse']
 ['0.004831245615587689' 'él']]
[['0.15358769151173862' 'great']
 ['0.1319592379144978' 'nice']
 ['0.13152937595051165' 'apartment']
 ...
 ['0.0028941631817634094' 'voor']
 ['0.0028941631817634094' 'voorzieningen']
 ['0.0028941631817634094' 'zijn']]
[['0.13220729928525723' 'apartment']
 ['0.11717877130116393' 'great']
 ['0.11181189800080424' 'place']
 ...
 ['0.0075868561476424914' 'warm']
 ['0.007586856147642

JSONDecodeError: Unterminated string starting at: line 1 column 34667 (char 34666)