In [None]:
import numpy as np
import pandas as pd
import spacy
from spacy.lang.en import English
# `en` is the default `model` argument of simple_tokenizer; `nlp` is the
# pipeline lemmatize() runs text through.
# NOTE(review): the same two objects are instantiated again at the top of the
# next cell — confirm whether both copies are needed.
en = English()
nlp = spacy.load("en_core_web_sm")
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
# Language objects shared by the helpers below:
#   en  -> default `model` of simple_tokenizer (and passed explicitly by listingvectorizer)
#   nlp -> pipeline whose tokens' `lemma_` values lemmatize() joins together
# NOTE(review): both are also instantiated in the import cell above — confirm
# whether the second load is intentional.
en = English()
nlp = spacy.load("en_core_web_sm")

def simple_tokenizer(doc, model=en):
    """Tokenize `doc` with `model` and return the kept tokens in lowercase.

    A token is kept only when it is alphabetic, does not look like a URL,
    and is not a stop word.

    Args:
        doc: Text to tokenize.
        model: Callable pipeline applied to `doc`; defaults to the
            module-level `en` object.

    Returns:
        list of lowercase token strings, in document order.
    """
    kept = []
    for token in model(doc):
        if token.is_alpha and not token.like_url and not token.is_stop:
            kept.append(token.lower_)
    return kept

def lemmatize(text):
    """Return `text` with every token replaced by its lemma.

    The text is processed by the module-level `nlp` pipeline and the
    resulting lemmas are joined with single spaces.
    """
    lemmas = (token.lemma_ for token in nlp(text))
    return " ".join(lemmas)

def load_glove_embeddings(dolma):
    """Load a whitespace-separated embeddings text file into a dictionary.

    Every non-blank line is split on whitespace: the first field is taken as
    the word and the remaining fields are parsed as float32 coefficients.

    Args:
        dolma: Path to the UTF-8 encoded embeddings .txt file.

    Returns:
        dict mapping each word to a 1-D float32 numpy array of its values.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a coefficient field cannot be parsed as a float.
    """
    vectorindex = {}
    with open(dolma, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank or whitespace-only line (e.g. trailing newline padding):
                # there is no word to index, so skip instead of failing on values[0].
                continue
            word, *coefs = values
            # Taking the values corresponding to each word in the .txt file
            # and adding them to a dictionary.
            vectorindex[word] = np.asarray(coefs, dtype='float32')
    return vectorindex

def listingvectorizer(listing, vectorindex, vector_size):
    """Turn one listing description into a single averaged word vector.

    The listing is lemmatized, tokenized with simple_tokenizer, and every
    token found in `vectorindex` contributes its vector to an element-wise
    mean.

    Args:
        listing: Description text. Any non-string value (None, NaN, pd.NA, ...)
            is treated as a listing with no recognizable words.
        vectorindex: dict mapping word -> vector.
        vector_size: Length of the zero vector returned when nothing can be
            averaged.

    Returns:
        The mean of the matched word vectors, or `np.zeros(vector_size)` when
        the listing is not a string or none of its tokens are in `vectorindex`.
    """
    if not isinstance(listing, str):
        # Missing descriptions cannot be parsed as text; fall back to the same
        # zero vector used for listings without any known word.
        return np.zeros(vector_size)

    tokenized = simple_tokenizer(lemmatize(listing), model=en)
    word_vectors = [vectorindex[word] for word in tokenized if word in vectorindex]
    if word_vectors:
        # Finds the "average" of all the word vectors in a given document
        return np.mean(word_vectors, axis=0)
    # If somehow no recognizable words at all, return zero vector
    return np.zeros(vector_size)

def cosine_similarity(listing_x, listing_y):
    """Cosine similarity between two listing vectors.

    Args:
        listing_x: First vector.
        listing_y: Second vector.

    Returns:
        dot(x, y) / (||x|| * ||y||), or the string 'NA' when either vector
        has zero norm (no word of that document was recognized).
    """
    numerator = np.dot(listing_x, listing_y)
    magnitude_x, magnitude_y = (np.linalg.norm(vec) for vec in (listing_x, listing_y))
    # A zero-length vector has no direction, so the similarity is undefined.
    if magnitude_x == 0 or magnitude_y == 0:
        return 'NA'
    return numerator / (magnitude_x * magnitude_y)

# Load the word -> vector table once; it is passed to listingvectorizer for
# every listing in the similarity cell below.
glove_embeddings = load_glove_embeddings('dolma_300_2024_1.2M.100_combined.txt')

In [11]:
INPUT = 'craigslistnonas.csv'
VECTOR_SIZE = 300  # Dim. of Dolma set to 300

df = pd.read_csv(INPUT)
df = df.replace(np.nan, pd.NA)

# One averaged embedding per listing description. Iterating the column
# directly avoids building a Series per row as iterrows() does.
listvects = [
    listingvectorizer(listing, glove_embeddings, vector_size=VECTOR_SIZE)
    for listing in df['descr']
]
df['listingvector'] = listvects

# Cartesian Product

largecartesian = df.merge(df, how='cross')
# Strict '<' on post_id drops rows pairing equal ids and keeps only one
# ordering of every remaining pair.
largecartesian = largecartesian[largecartesian['post_id_x'] < largecartesian['post_id_y']]
# Drop pairs whose description text is identical.
largecartesian = largecartesian[largecartesian['descr_x'] != largecartesian['descr_y']]

# Cosine similarity for every remaining pair, in row order.
listingsim = [
    cosine_similarity(vector1, vector2)
    for vector1, vector2 in zip(largecartesian['listingvector_x'],
                                largecartesian['listingvector_y'])
]

# The vectors are dropped before export; only the score is written out.
largecartesian = largecartesian.drop(columns=['listingvector_x', 'listingvector_y'])
largecartesian['similarity_score'] = listingsim
largecartesian.to_csv('craigslistcartbaycos.csv', index=False)

In [None]:
# Count 1- to 3-grams over the descriptions, tokenized with simple_tokenizer.
# NOTE(review): `craigslist` is not defined in the cells visible here —
# confirm it is created before this cell runs.
cv = CountVectorizer(ngram_range=(1, 3), tokenizer=simple_tokenizer)
count_desc_vecs = cv.fit_transform(craigslist['descr'].apply(str)).toarray()

# Total count of each n-gram, summed over all descriptions.
desc_count = {
    term: total
    for term, total in zip(cv.get_feature_names_out(), count_desc_vecs.sum(axis=0))
}

# Print the n most frequent n-grams, highest count first.
n = 50
ranked = sorted(desc_count.items(), key=lambda item: item[1], reverse=True)
print(ranked[:n])