In [1]:
#Ref: Langchain in your pocket, Mehul Gupta
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders.csv_loader import CSVLoader
import numpy as np
import pandas as pd

In [2]:
import sys
import os

# Directory that holds the local `api_key.py` module with the secrets.
# NOTE(review): absolute, machine-specific path; consider making it configurable.
API_KEY_DIR = '/home/vino/api_keys'

# Only extend sys.path once, so re-running this cell does not pile up
# duplicate entries.
if API_KEY_DIR not in sys.path:
    sys.path.append(API_KEY_DIR)

from api_key import OPENAI_API_KEY, OWM_API_KEY

# Expose the OpenAI key through the environment; the OpenAI objects created
# later in this notebook are constructed without an explicit key and
# presumably read it from here.
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

In [3]:
#### Recommendation System using RAG

In [4]:
# Size of the synthetic data set: number of users and of distinct items.
num_users = 1000
num_items = 20

# IDs are consecutive integers starting at 1 (1..num_users and 1..num_items).
user_ids = np.arange(num_users) + 1
item_ids = np.arange(num_items) + 1

In [5]:
# Create random interaction data
# A dedicated, seeded generator makes the interactions (and everything derived
# from them further down) reproducible under "Restart Kernel -> Run All".
SEED = 42
rng = np.random.default_rng(SEED)

# Ten draws per user on average; users and items are sampled independently
# and with replacement.
num_interactions = num_users * 10
data = {
    'user_id': rng.choice(user_ids, size=num_interactions),
    'item_id': rng.choice(item_ids, size=num_interactions),
}

# Create a pandas DataFrame from the data; a (user, item) pair that was drawn
# more than once is kept only once.
df = pd.DataFrame(data).drop_duplicates()

# Display the first few rows of the generated data
print(df.head())

   user_id  item_id
0      451       18
1      562        6
2      719       11
3      961       13
4      334       17


In [6]:
# Collapse the interaction table to one row per user, holding the list of
# item IDs that user interacted with.
# NOTE: this rebinds `df`, so the cell must be re-run together with the cell
# that builds the raw interaction frame, not on its own.
df = df.groupby(['user_id'])['item_id'].agg(list).reset_index()


def _to_item_slots(seen_items):
    """Return a fixed-length list with one slot per item ID in 1..num_items.

    The slot for an item holds the item ID itself when it appears in
    ``seen_items`` and 0 otherwise.
    """
    seen = set(seen_items)  # constant-time membership checks
    # Driven by num_items rather than a literal so the encoding follows the
    # catalogue size configured at the top of the notebook.
    return [item if item in seen else 0 for item in range(1, num_items + 1)]


df['item_id'] = df['item_id'].apply(_to_item_slots)
df.to_csv('./data/recommendation.csv',index=False)

# Read the CSV back through LangChain's CSVLoader to obtain documents.
loader = CSVLoader(file_path="./data/recommendation.csv")
data = loader.load()

In [7]:
# Split the loaded documents with chunk_size=200 and no overlap.
text_splitter = CharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(data)

# Embedding model for the vector store and the LLM that answers questions.
embeddings = OpenAIEmbeddings()
llm = OpenAI(
    model_name="gpt-3.5-turbo-instruct",
    temperature=0,
)

# Build a Chroma vector store from the split documents.
docsearch = Chroma.from_documents(texts, embeddings)

In [8]:
# Retrieval QA chain: the Chroma store acts as retriever and the chain is
# built with the "stuff" chain type on top of the LLM.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
)

In [9]:
# The prompt is assembled from adjacent string literals. Each fragment ends
# with a space so the sentences stay separated once they are concatenated.
qa.run(
    'Suggest 2 articles to user-id 78 using given data which it has not seen. '
    'Follow this approach 1: Find similar Users and 2: suggest new articles from similar users. '
    'Also give a reason for suggestion'
)

'\n1. User 878 has a similar item history to user 78, with both users having viewed items 0, 2, 3, and 19. Therefore, it would be beneficial for user 78 to check out the items that user 878 has viewed that they have not, such as items 4, 8, 11, and 12. This could potentially introduce user 78 to new articles that they may be interested in.\n\n2. User 93 has also viewed item 2, which is the most viewed item for user 78. Therefore, it would be beneficial for user 78 to check out the items that user 93 has viewed that they have not, such as items 7, 12, and 15. This could potentially introduce user 78 to new articles that they may be interested in, based on their interest in item 2.'

In [10]:
#### Vector DB

In [11]:
import chromadb
from chromadb.utils import embedding_functions

In [14]:
# Chroma client created with default settings.
chroma_client = chromadb.Client()
#creating embedding function
st=embedding_functions.SentenceTransformerEmbeddingFunction(model_name= "all-MiniLM-L6-v2")
#creating (or reusing) the collection
# get_or_create_collection keeps this cell re-runnable: create_collection
# raises when a collection named "test" already exists in the running kernel.
# Passing the embedding function makes the collection embed query texts with
# the same model that is used for the documents.
collection = chroma_client.get_or_create_collection(name="test", embedding_function=st)

In [15]:
#loading the text file and segmenting
with open('./data/dummy_data2.txt', 'r') as file:
    raw_text = file.read()

# Newlines and periods both count as sentence boundaries. Splitting leaves
# empty or whitespace-only fragments behind (e.g. a period followed by a
# newline), so trim every fragment and drop the blank ones before indexing.
data = [fragment.strip() for fragment in raw_text.replace('\n', '.').split('.')]
data = [fragment for fragment in data if fragment]

#adding sentences to collection
# Skip the call entirely when the file produced no usable sentence.
if data:
    collection.add(
        documents=data,
        embeddings=st(data),
        ids=['id' + str(x) for x in range(len(data))],
    )

In [16]:
# Query the collection with a single question and keep the top 5 results.
question = "who is vinothkumar"
results = collection.query(query_texts=[question], n_results=5)

In [18]:
#results