In [144]:
!pip install pandas
!pip install numpy
!pip install contractions

import pandas as pd
import numpy as np
import json



# Pre-processing (Query generation)

In [147]:
FILE_PATH = "Datasets/train/train_Input_UserBased_PNC.json"
df = pd.read_json(FILE_PATH)

In [148]:
articles = pd.DataFrame(df['input'].apply(lambda x: x.split("article:", 1)[-1].strip() if "article:" in x else None))

In [149]:
articles['input'].iloc[32]

'Back in August, The Denver Post’s John Ingold scrutinized statements from conservatives that Colorado should free up money'

In [150]:
!pip install nltk
import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import contractions
import re

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/tusharbudhwani/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tusharbudhwani/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tusharbudhwani/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [151]:
def expand_contractions(text):
    
    # Handle special case for U.S.
    text = text.replace('U.S.', 'United States')
    
    
    expanded_text = contractions.fix(text)
    return expanded_text

In [152]:
articles['expanded_contractions'] = articles['input'].apply(lambda x: expand_contractions(x))

In [153]:
articles['lower_cased'] = articles['expanded_contractions'].str.lower()
articles['lower_cased'].head(5)

0    it is hard to find a restaurant that does not ...
1    as duchess catherine continues to peck away at...
2    today, vanity fair national political editor t...
3    on monday, the new york times published an exh...
4    by julie miller, vanity fair inspired by the m...
Name: lower_cased, dtype: object

In [154]:
def remove_stopwords(text):
    # Tokenize the text
    words = word_tokenize(text)

    # Get English stop words
    stop_words = set(stopwords.words('english'))

    # Remove stop words from the tokenized words
    filtered_words = [word for word in words if word.lower() not in stop_words]

    # Join the filtered words to form the final text
    filtered_text = ' '.join(filtered_words)

    return filtered_text

# Example usage
articles['removed_stop_words'] = articles['lower_cased'].apply(lambda x: remove_stopwords(x))
print("Text after Stopword Removal:\n", articles['removed_stop_words'].head(5))

Text after Stopword Removal:
 0    hard find restaurant place little card table i...
1    duchess catherine continues peck away furnishi...
2    today , vanity fair national political editor ...
3    monday , new york times published exhaustive o...
4    julie miller , vanity fair inspired magazine '...
Name: removed_stop_words, dtype: object


In [155]:
# Use re.sub() to remove parentheses
def remove_extra_characters(text):
  # Replace parentheses with an empty string
  return re.sub(r'\(|\)|\;|\:|\.|\,|\-|\~|\`', '', text)

In [156]:
articles['removed_extra_characters'] = articles['removed_stop_words'].apply(lambda x: remove_extra_characters(x))

In [157]:
articles['removed_extra_characters'].iloc[0]

'hard find restaurant place little card table inquiring establishment    really awful   b  tolerable   c  sublime '

In [158]:
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [159]:
import spacy

nlp = spacy.load('en_core_web_sm')

# Define a function to lemmatize text using spaCy
def lemmatize_text(text):
    doc = nlp(text)
    return ' '.join([token.lemma_ for token in doc])

In [160]:
articles['lemmatized'] = articles['removed_extra_characters'].apply(lambda x: lemmatize_text(x))

In [161]:
print(articles['removed_extra_characters'].iloc[0])
print(articles['lemmatized'].iloc[0])

hard find restaurant place little card table inquiring establishment    really awful   b  tolerable   c  sublime 
hard find restaurant place little card table inquire establishment     really awful    b   tolerable    c   sublime


In [162]:
def get_synonyms(word, max_synonyms=1):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            if lemma.name() != word:  # Exclude the original word
                synonyms.add(lemma.name())
            if len(synonyms) >= max_synonyms:
                break
        if len(synonyms) >= max_synonyms:
            break
    return list(synonyms) 

def expand_text_with_synonyms(text):
    words = nltk.word_tokenize(text)
    expanded_text = []

    for word in words:
        synonyms = get_synonyms(word, max_synonyms=1)
        if synonyms:
            expanded_text.extend(synonyms)
        else:
            expanded_text.append(word)

    return ' '.join(expanded_text)


In [163]:
articles['synonyms'] = articles['lemmatized'].apply(lambda x: expand_text_with_synonyms(x))

In [164]:
print(articles['lemmatized'].iloc[25])

ever get enough prince harry ? answer   rhetorical question   back another roundup


In [165]:
articles['appended_synonyms'] = articles['lemmatized'] + articles['synonyms']

In [166]:
articles['appended_synonyms'].iloc[1]

'duchess catherine continue peck away furnish london apartment country house   former estimateduchess Catherine_of_Aragon go_on batch outside supply London flat state firm erstwhile estimation'

In [167]:
!pip install rank_bm25
from rank_bm25 import BM25Okapi



In [168]:
corpus = df['profile'].apply(lambda x: [item['text'] for item in x]).tolist()

In [169]:
corpus[:1]

[['The three make a trip of atypical opera themes, but no new opera brought the Met as much controversy as Klinghoffer.',
  "Though I might not subscribe to every prayer in the Siddur, I always use the synagogue time for my own prayer of thanks for being alive and the multitudes of blessings I enjoy.  I want to let the Lord know I haven't forgotten them.  Then, leaving the hall, the yarmulke still in place on my head, I head home feeling a little purer.",
  'I expected him to do well and show me some of his work. The money could be renewed for the second semester and the next year of college. Though I imagined that Mel would want to maintain periodic contact -- at least it was what I hoped -- I was wrong.',
  'Although my mother swept away any feeling for her native land, I saw my trips as partly for her, maybe an effort to reconnect her to a land that only I wanted her to reconnect with.']]

In [170]:
tokenized_corpus = [[word_tokenize(sentence) for sentence in text_list] for text_list in corpus]

In [171]:
bm25_instances = []
for sublist in tokenized_corpus:
    bm25_instance = BM25Okapi(sublist)
    
    # Append the BM25 instance to the list
    bm25_instances.append(bm25_instance)
# bm25_instances['bm25_instance'] = tokenized_corpus_df['Tokenized_Sentences'].apply(lambda x: BM25Okapi(x))

In [172]:
bm25_instances[:1]

[<rank_bm25.BM25Okapi at 0x38eeee6d0>]

In [173]:
# query = articles['removed_extra_characters'].head(1).to_string()
query = articles['appended_synonyms'].to_string()
# query = articles['lemmatized'].to_string()

In [174]:
query_instances = pd.DataFrame()

In [175]:
query_instances['query'] = articles['appended_synonyms'].apply(lambda x: x)
# query_instances['query'] = articles['lemmatized'].apply(lambda x: x)
query_instances.head(1)

Unnamed: 0,query
0,hard find restaurant place little card table i...


In [176]:
tokenized_query = pd.DataFrame()
tokenized_query['query'] = query_instances['query'].apply(lambda x: x.split())
tokenized_query.head(1)

Unnamed: 0,query
0,"[hard, find, restaurant, place, little, card, ..."


In [177]:
tokenized_query_list = tokenized_query.values.tolist()
tokenized_query_list = sum(tokenized_query_list, [])

tokenized_query_list[:1]

# BM25 Retrieval

In [179]:
doc_scores = []
for query, bm25_instance in zip(tokenized_query_list, bm25_instances):
    doc_score = bm25_instance.get_scores(query)
#     print(query)
#     print(bm25_instance)
    # Append the scores to the list
    doc_scores.append(doc_score)
# doc_scores['doc_scores'] = bm25_instances.apply(lambda x: x.split())

doc_scores[19:20]

[array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  4.12883893,  0.        ,  3.31944997,
         2.17551694,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  3.90694033,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  2.29037931,  0.        ,  0.        ,
         0.        ,  2.17551694,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        , 12.5554493 ,  0.        ,
         0.        ,  0.        ,  3.90694033,  2.93542304,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         3.77180028,  0.        , 12.5554493 ,  5.2142467 ,  0.        ,
         0.        ,  0.        ,  3.47159626,  0.        ,  0.        ,
         5.72666892,  0.        ,  0.        ,  0.        ,  0.        ,
         2.10513534,  0.        ,  0.        ,  0.        ,  0.        ,
        12.5554493 ,  0.        ,  0.        ,  0. 

In [180]:
print(articles['input'].iloc[32])
scoree = bm25_instances[32].get_scores(articles['input'].iloc[32].split())
print(scoree)
df['profile'].iloc[32]

Back in August, The Denver Post’s John Ingold scrutinized statements from conservatives that Colorado should free up money
[0.         2.88926974 2.48929956 0.         5.09518533 0.50976597
 0.         2.89900495 0.50976597 0.         0.         0.
 0.45113083 0.87634777 3.64471508 0.4786594  0.         0.
 0.         2.47059855 1.47501704 0.45113083 5.59372966 1.38314084
 0.95396388 2.95523485 3.93545013 0.70046391 0.87634777 0.97556476
 1.41744777 3.99048727]


[{'text': 'Sounding much like Trump, who last week called the Republican tax bill\xa0“one of the great Christmas gifts to middle-income',
  'title': "Echoing Trump, Gardner Says Passage Of Tax Bill Will Be A 'Great Christmas Celebration Across The Country'",
  'category': 'politics',
  'id': '10320'},
 {'text': 'U.S. Sen. Cory Gardner (R-CO) has been taking it pretty hard from all Coloradans, judging from his low approval ratings, and',
  'title': "Conservative Group Gives Gardner 'Enema Of The State' Award",
  'category': 'politics',
  'id': '10321'},
 {'text': 'Colorado officials and pro-abortion advocates are ecstatic over new statistics showing that the teen abortion rate has dropped',
  'title': "Anti-Choice Leaders In Colorado Still Angry About Program That's Dropped Teen Abortion Rate By 64 Percent",
  'category': 'politics',
  'id': '10322'},
 {'text': 'State Sen. Kevin Lundberg (R-Berthoud), who’s running for state treasurer, is touting his endorsement by David Barton, who’s',

In [181]:
top_k_documents_list = []

for query, corp, bm25_instance in zip(tokenized_query_list, corpus, bm25_instances): 
    top_k_document = bm25_instance.get_top_n(query, corp, n=3)  
    top_k_documents_list.append(top_k_document)

In [182]:
    
top_k_documents_list[:1]

[["Though I might not subscribe to every prayer in the Siddur, I always use the synagogue time for my own prayer of thanks for being alive and the multitudes of blessings I enjoy.  I want to let the Lord know I haven't forgotten them.  Then, leaving the hall, the yarmulke still in place on my head, I head home feeling a little purer.",
  'Although my mother swept away any feeling for her native land, I saw my trips as partly for her, maybe an effort to reconnect her to a land that only I wanted her to reconnect with.',
  'I expected him to do well and show me some of his work. The money could be renewed for the second semester and the next year of college. Though I imagined that Mel would want to maintain periodic contact -- at least it was what I hoped -- I was wrong.']]

In [183]:
type(top_k_documents_list)

list

In [184]:
top_k_documents_list[1]

['(Apparently, Sharon Stone and Quincy Jones once presented two categories in a row and accidentally gave away the second envelope',
 'Today, Vanity Fair national political editor Todd S. Purdum discussed the "substance" and "words" and "point" of First Lady Michelle Obama\'s flawless speech at last night\'s Democratic National Convention. This post will not be covering any of that.',
 "In anticipation of the show's sixth season VF.com phoned Bryant earlier this week to discuss the many character transformations"]

In [185]:
top_k_documents_list[0]

["Though I might not subscribe to every prayer in the Siddur, I always use the synagogue time for my own prayer of thanks for being alive and the multitudes of blessings I enjoy.  I want to let the Lord know I haven't forgotten them.  Then, leaving the hall, the yarmulke still in place on my head, I head home feeling a little purer.",
 'Although my mother swept away any feeling for her native land, I saw my trips as partly for her, maybe an effort to reconnect her to a land that only I wanted her to reconnect with.',
 'I expected him to do well and show me some of his work. The money could be renewed for the second semester and the next year of college. Though I imagined that Mel would want to maintain periodic contact -- at least it was what I hoped -- I was wrong.']

In [186]:
df.loc[0,"input"] + " User Profile: " + ' '.join(top_k_documents_list[0])

"Which category does this article relate to among the following categories? Just answer with the category name without further explanation. categories: [women, religion, politics, style & beauty, entertainment, culture & arts, sports, science & technology, travel, business, crime, education, healthy living, parents, food & drink] article: It's hard to find a restaurant that doesn't now place a little card at your table inquiring if the establishment was: (a) really awful; (b) tolerable; (c) sublime. User Profile: Though I might not subscribe to every prayer in the Siddur, I always use the synagogue time for my own prayer of thanks for being alive and the multitudes of blessings I enjoy.  I want to let the Lord know I haven't forgotten them.  Then, leaving the hall, the yarmulke still in place on my head, I head home feeling a little purer. Although my mother swept away any feeling for her native land, I saw my trips as partly for her, maybe an effort to reconnect her to a land that onl

In [187]:
personalized_input_list = []
i = 0
for item in df['profile']:
    # Assuming each item is a list of dictionaries
    doc = ""
    for profile in item:
        text = profile['text']
        category = profile['category']
        if text in top_k_documents_list[i]:
            doc = doc + (f'The category for the article: "{text}" is "{category}", and ')
    new_doc = doc[:-6] + ". "
    new_doc = new_doc + df["input"].iloc[i]
    # If you only want to print this for the first dictionary in each list, uncomment the following line
    i+=1
    personalized_input_list.append(new_doc)
#     personalized_prompt.append(f'The category for the article: "{text}" is "{category}". {df['input']})

In [188]:
import json

data_list = []

for index,row in df.iterrows():
    data = {}
    data["id"] = str(row["id"])
    data["input"] = df.loc[index,"input"]
    data["top_3_user_documents"] = top_k_documents_list[index]
    
    data["personalized_input"] = df.loc[index,"input"] + " User Profile: " + ' '.join(top_k_documents_list[index])
#     data["personalized_input"] = personalized_input_list[index]
    
    data_list.append(data)


print(len(data_list), " items added in the data_list")

5914  items added in the data_list


In [189]:
# Convert the dictionary to JSON format
json_data = json.dumps(data_list, indent=2)

# Save the JSON data to a file in the local directory
json_file_path = "preprocessed/without_synonyms_personalized_validation_k3_Input_UserBased_PNC.json"
with open(json_file_path, "w") as json_file:
    json_file.write(json_data)