# Text Mining

In [1]:
import pandas as pd
airbnb = pd.read_csv("listings-2.csv")

#### TF-IDF Feature Importance

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

text_columns = ["name", "description", "neighborhood_overview", "host_name", "host_about", "amenities"
]
#Initialize dictionary
top_terms_per_column = {}

# Loop through each column to apply TF-IDF
for col in text_columns:
   
    vectorizer = TfidfVectorizer(stop_words="english")

    tfidf_matrix = vectorizer.fit_transform(airbnb[col].fillna(''))
    
    feature_names = vectorizer.get_feature_names_out()
    sums = tfidf_matrix.sum(axis=0)
    
    data = [(term, sums[0, idx]) for idx, term in enumerate(feature_names)]
    ranking = pd.DataFrame(data, columns=['term', 'rank']).sort_values('rank', ascending=False)
    
    top_terms_per_column[col] = ranking.head()

for col, top_terms in top_terms_per_column.items():
    print(f"\nTop terms for column '{col}':\n", top_terms)


Top terms for column 'name':
          term         rank
6975    ocean  1425.045058
3036    beach  1395.925482
3091  bedroom  1139.954502
8916     view  1112.242715
3604    condo  1030.974647

Top terms for column 'description':
           term         rank
4922        br  2563.615370
4471     beach  1194.293806
12996    ocean  1002.673882
15043   resort   956.814199
4548   bedroom   926.362242

Top terms for column 'neighborhood_overview':
           term         rank
2016        br  1613.797167
1670     beach  1024.165479
7760     miles   772.773418
12675  waikiki   696.706960
7160   located   661.812596

Top terms for column 'host_name':
          term         rank
2288     maui  1048.838257
1206   hawaii   809.737802
2826  rentals   778.481619
3441   vacasa   758.949933
2833   resort   547.877894

Top terms for column 'host_about':
            term         rank
12137  vacation  1395.155847
7382       maui   978.541900
7044       love   872.832162
5260     hawaii   761.731695
821  

The most important terms in each of these columns seem in line with what we would expect from Hawaii vacation rentals. We can seem under host_name, that 'rentals' is ranked third, suggesting that many hosts are large rental companies instead of individual homeowners. Under amenities, 'u2013' stands out as we do not know what that is referring to.

#### Cosine Similarity

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import pandas as pd

text_columns = ["name", "description", "neighborhood_overview", "host_name", "host_about", "amenities"]
tfidf_matrices = {}
similarity_results = {}
sample_size = 500 

#Map rows to IDs
index_to_id = airbnb["id"].to_dict()

#TF-IDF matrices for each column
for col in text_columns:
    vectorizer = TfidfVectorizer(stop_words="english", max_features=1000)
    tfidf_matrices[col] = vectorizer.fit_transform(airbnb[col].fillna(''))

#Similarity calculations with sampling
for col in text_columns:
    tfidf_matrix = tfidf_matrices[col]
    
    
    if tfidf_matrix.shape[0] > sample_size:
        tfidf_sample = tfidf_matrix[:sample_size]
    else:
        tfidf_sample = tfidf_matrix
    
    
    nn = NearestNeighbors(n_neighbors=10, metric="cosine").fit(tfidf_sample)
    distances, indices = nn.kneighbors(tfidf_sample)

    #Exclude self-pairs, replace indices with IDs
    similarity_results[col] = [(index_to_id[i], index_to_id[indices[i][j]], 1 - distances[i][j]) 
                               for i in range(len(distances)) 
                               for j in range(1, len(indices[i])) 
                               if i != indices[i][j] and (1 - distances[i][j]) > 0.8]


    print(f"\nTop similar pairs in column '{col}':")
    for id1, id2, score in similarity_results[col][:10]:
        print(f"Entry {id1} and Entry {id2} have similarity score: {score}")



Top similar pairs in column 'name':
Entry 162600 and Entry 1371205 have similarity score: 0.837395166335527
Entry 162600 and Entry 1371229 have similarity score: 0.837395166335527
Entry 162600 and Entry 1365551 have similarity score: 0.837395166335527
Entry 162600 and Entry 1365599 have similarity score: 0.837395166335527
Entry 162600 and Entry 1339958 have similarity score: 0.837395166335527
Entry 162600 and Entry 1371261 have similarity score: 0.837395166335527
Entry 162600 and Entry 1371734 have similarity score: 0.8287365030934828
Entry 162600 and Entry 1371831 have similarity score: 0.8287365030934828
Entry 36789 and Entry 222203 have similarity score: 1.0
Entry 177939 and Entry 760196 have similarity score: 0.9344967608069812

Top similar pairs in column 'description':
Entry 5387 and Entry 1466642 have similarity score: 0.9536905941320121
Entry 13688 and Entry 543962 have similarity score: 0.9206445094386414
Entry 168205 and Entry 547635 have similarity score: 0.8579653471204687

In [6]:
#View listing descriptions for matched IDs example

airbnb[airbnb["id"].isin([806017561091754918, 930473613964233211])][["id","host_name","listing_url", "description"]]


Unnamed: 0,id,host_name,listing_url,description
23767,806017561091754918,Steven,https://www.airbnb.com/rooms/806017561091754918,Take it easy at this unique and tranquil getaway.
27256,930473613964233211,Joseph,https://www.airbnb.com/rooms/930473613964233211,Take it easy at this unique and tranquil getaway.


The most interesting column here is the description column because this is the column we expect to be the most unique and original per property. However, we have certain properties which had similarity scores of 1.0 and upon random selection of one such paid, we found what appeared to be identical properties listed by two different hosts. The even used the same license ID, which according to AirBnb is unique per host and property. This warrants further exploration.

#### N-gram analysis for word combinations

In [28]:
#N-gram analysis (sentence patterns)

from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

#Store top n-grams
top_ngrams_per_column = {}

#Loop through each text column
for col in text_columns:
   
    vectorizer = CountVectorizer(ngram_range=(2, 3), stop_words='english')
    ngram_matrix = vectorizer.fit_transform(airbnb[col].fillna(''))
    ngram_counts = ngram_matrix.sum(axis=0)
    ngrams = [(ngram, ngram_counts[0, idx]) for ngram, idx in vectorizer.vocabulary_.items()]
    sorted_ngrams = sorted(ngrams, key=lambda x: x[1], reverse=True)
    
    top_ngrams_per_column[col] = sorted_ngrams[:10]

#Print the top n-grams for each column
for col, ngrams in top_ngrams_per_column.items():
    print(f"\nTop n-grams for column '{col}':")
    for ngram, count in ngrams:
        print(f"{ngram}: {count}")


Top n-grams for column 'name':
ocean view: 3034
ocean views: 1174
free parking: 829
beach club: 624
waikiki beach: 590
ko olina: 579
walk beach: 497
honua kai: 489
maui resort: 473
hot tub: 449

Top n-grams for column 'description':
br br: 14287
ocean views: 2996
waikiki beach: 2842
ocean view: 2839
living room: 2707
walking distance: 2112
washer dryer: 2104
bedroom bath: 1876
fully equipped: 1874
king bed: 1821

Top n-grams for column 'neighborhood_overview':
br br: 7603
miles br: 6458
beach park: 2282
walking distance: 2162
waikiki beach: 1712
ala moana: 1274
shopping center: 1213
park miles: 1194
min drive: 1169
minute drive: 1127

Top n-grams for column 'host_name':
resort rentals: 728
vacasa hawaii: 652
maui resort: 526
maui resort rentals: 526
maui condo: 400
cb island: 391
island vacations: 391
cb island vacations: 391
castle resorts: 391
resorts hotels: 391

Top n-grams for column 'host_about':
vacation rental: 5512
vacation rentals: 4479
real estate: 3174
big island: 2404
ren

This analyis is typically used to detect word combinations in spam. In our case, the word combinations are in line with what we would expect for each text column, like 'coffee' and 'maker' under amenities. 