In [1]:
from git import Repo
from langchain.text_splitter import Language
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import LanguageParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationSummaryMemory
from langchain.chains import ConversationalRetrievalChain

In [2]:
# Show the kernel's working directory; the relative paths used later
# ("test_repo/", "./db") are resolved against it.
%pwd

'c:\\Users\\Tanvir Ahmed\\Desktop\\Gen-AI-Mastery-Projects\\End-to-End-Projects\\Source-Code-Analysis\\research'

In [3]:
# Create the clone target with the standard library instead of a shell command:
# this behaves the same on every OS and does not complain when the directory
# already exists, so the cell can be re-run safely.
os.makedirs("test_repo", exist_ok=True)


In [4]:
repo_path = "test_repo/"
repo_url = "https://github.com/tanvircs/Sentiment-Analysis-of-COVID-19-Vaccination-on-Twitter"

# Cloning into a directory that already contains a checkout raises, which
# breaks "Restart Kernel -> Run All". Clone only when no working copy is
# present; otherwise open the existing one.
if os.path.isdir(os.path.join(repo_path, ".git")):
    repo = Repo(repo_path)
else:
    repo = Repo.clone_from(repo_url, to_path=repo_path)

In [5]:
# Build a loader over the cloned repository: every path is globbed, only
# ".py" files are kept, and each file is handed to a Python LanguageParser
# configured with parser_threshold=500.
python_parser = LanguageParser(language=Language.PYTHON, parser_threshold=500)
loader = GenericLoader.from_filesystem(
    repo_path,
    glob="**/*",
    suffixes=[".py"],
    parser=python_parser,
)

In [6]:
# Run the loader over the repository; the result is the collection of parsed
# Document objects inspected in the next cells.
documents = loader.load()

In [9]:
# Inspect the first parsed document (content plus metadata).
documents[0]

Document(page_content='def preprocess_word(word):\n    word = word.lower()\n    word = re.sub("\'", "", word)\n    word = word.strip(\'\\\'"?!,.():;\')\n    word = re.sub(r\'(.)\\1+\', r\'\\1\\1\', word)\n    word = re.sub("@[A-Za-z0-9_]+","", word)\n    word = re.sub("#[A-Za-z0-9_]+","", word)\n    word = re.sub(r\'http\\S+\', \'\', word)\n    word = re.sub("[^a-z0-9]"," ", word)\n    word = re.sub(\'\\[.*?\\]\',\' \', word)\n    word = re.sub(r\'(-|\\\')\', \'\', word)\n    return word', metadata={'source': 'test_repo\\python\\Knn_DC.py', 'content_type': 'functions_classes', 'language': <Language.PYTHON: 'python'>})

In [8]:
# Number of documents produced by the loader.
len(documents)

8

In [10]:
# Python-aware recursive splitter: chunks of at most 500 characters with a
# 20-character overlap between neighbouring chunks.
documents_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=500,
    chunk_overlap=20,
)

In [11]:
# Split the loaded documents into the chunks that will be embedded and indexed.
texts = documents_splitter.split_documents(documents)

In [12]:
# Preview only the first chunks; displaying the whole list dumps every chunk
# into the notebook output and buries the narrative.
texts[:2]

[Document(page_content='def preprocess_word(word):\n    word = word.lower()\n    word = re.sub("\'", "", word)\n    word = word.strip(\'\\\'"?!,.():;\')\n    word = re.sub(r\'(.)\\1+\', r\'\\1\\1\', word)\n    word = re.sub("@[A-Za-z0-9_]+","", word)\n    word = re.sub("#[A-Za-z0-9_]+","", word)\n    word = re.sub(r\'http\\S+\', \'\', word)\n    word = re.sub("[^a-z0-9]"," ", word)\n    word = re.sub(\'\\[.*?\\]\',\' \', word)\n    word = re.sub(r\'(-|\\\')\', \'\', word)\n    return word', metadata={'source': 'test_repo\\python\\Knn_DC.py', 'content_type': 'functions_classes', 'language': <Language.PYTHON: 'python'>}),
 Document(page_content="def handle_emojis(tweet):\n    # Smile -- :), : ), :-), (:, ( :, (-:, :')\n    tweet = re.sub(r'(:\\s?\\)|:-\\)|\\(\\s?:|\\(-:|:\\'\\))', ' EMO_POS ', tweet)\n    # Laugh -- :D, : D, :-D, xD, x-D, XD, X-D\n    tweet = re.sub(r'(:\\s?D|:-D|x-?D|X-?D)', ' EMO_POS ', tweet)\n    # Love -- <3, :*\n    tweet = re.sub(r'(<3|:\\*)', ' EMO_POS ', tweet)\

In [13]:
# Number of chunks produced by the splitter.
len(texts)

102

In [14]:
from dotenv import load_dotenv

# Populate os.environ from a local .env file, if one exists.
load_dotenv()

OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

# Fail here with an actionable message instead of letting a missing key
# surface later as an unrelated-looking error.
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to a .env file or export it "
        "in the environment before running this notebook."
    )

In [15]:
# Write the key read in the previous cell back into the process environment.
# os.environ only accepts str values, so this raises TypeError when
# OPENAI_API_KEY is None (i.e. the variable was not set).
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [16]:
# Embedding client used to vectorise the code chunks.
# NOTE(review): disallowed_special=() presumably disables the tokenizer's
# special-token check so source text containing such tokens does not raise —
# confirm against the OpenAIEmbeddings documentation.
embeddings=OpenAIEmbeddings(disallowed_special=())

In [17]:
# Embedding every chunk costs API calls, and calling from_documents again on
# an existing persist_directory adds the same chunks a second time. Reuse the
# store already on disk when there is one; delete ./db to force a rebuild.
persist_directory = './db'
if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
else:
    vectordb = Chroma.from_documents(texts, embedding=embeddings, persist_directory=persist_directory)

In [18]:
# Explicitly flush the collection to the persist directory given above.
vectordb.persist()

In [19]:
# Chat model shared by the conversation memory and the retrieval chain below.
llm = ChatOpenAI(model_name="gpt-4")

In [20]:
# Conversation memory backed by the same LLM; its contents are exposed under
# the "chat_history" key and returned as message objects. It is passed to the
# chain in the next cell.
memory = ConversationSummaryMemory(llm=llm, memory_key = "chat_history", return_messages=True)

In [21]:
# Retriever over the vector store: MMR search returning 8 chunks per query.
retriever = vectordb.as_retriever(search_type="mmr", search_kwargs={"k": 8})

# Wire the LLM, retriever and memory into one conversational QA chain.
qa = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=retriever,
    memory=memory,
)

In [29]:
# First query: ask the chain about a function defined in the indexed repository.
question = "What is handle_emojis function in the code?"

In [30]:
# Run the chain on the current question and show only the "answer" field
# of the returned mapping.
result = qa(question)
print(result['answer'])

The `handle_emojis` function in the code is used to replace different types of emojis in a tweet with the string 'EMO_POS'. It handles different types of emojis such as smile, laugh, love, wink, and sad emojis. The function uses regular expressions to identify these emojis in the tweet and then replaces them with 'EMO_POS'.


In [31]:
# Second query, sent through the same chain (and therefore the same memory).
question = "What is word_vector function in the code?"

In [32]:
# Run the chain on the current question and show only the "answer" field
# of the returned mapping.
result = qa(question)
print(result['answer'])

The `word_vector` function is used to generate a vector representation of a given list of tokens (words). This is done with the help of the Word2Vec model (`model_w2v`), which has been trained previously. 

The function works as follows:
1. It initializes an empty vector of a given size.
2. It then iterates over each word in the tokens. For each word, it tries to find the corresponding vector in the Word2Vec model.
3. If the word is in the model's vocabulary, it adds the word's vector to the initially created vector and increments a counter.
4. If the word is not in the model's vocabulary (KeyError), it simply skips this word.
5. After iterating over all words, if the counter is not zero (i.e., if there were words that were in the model's vocabulary), it divides the vector by the counter. This results in an average vector that represents all the words in the tokens.
6. This average vector is then returned. 

In summary, this function generates an average Word2Vec vector for a given lis