In [2]:
import pickle
import faiss
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

def read_qa_file(file_path, encoding='utf-8'):
    """Read alternating question/answer lines from a text file.

    The file is expected to hold one question per line, each immediately
    followed by its answer on the next line. Every line is stripped of
    leading and trailing whitespace.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            so the result does not depend on the platform locale.

    Returns:
        A ``(questions, answers)`` tuple of two equally long lists of
        strings, where ``answers[i]`` belongs to ``questions[i]``.

    Raises:
        ValueError: If the file ends with a non-blank question line that
            has no answer line after it.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        lines = file.readlines()

    # An odd line count means the last line has no partner. A blank trailing
    # line is just editor noise and is dropped; anything else is a malformed
    # file and is reported explicitly instead of surfacing as an IndexError.
    if len(lines) % 2:
        if lines[-1].strip():
            raise ValueError(
                f"{file_path!r} ends with a question that has no answer: "
                f"{lines[-1].strip()!r}"
            )
        lines.pop()

    # Even positions are questions, odd positions are their answers.
    questions = [line.strip() for line in lines[0::2]]
    answers = [line.strip() for line in lines[1::2]]

    return questions, answers

# Load the question/answer pairs that form the searchable corpus.
file_path = 'qa.txt'
questions, answers = read_qa_file(file_path)

# Fit a TF-IDF model on the questions, then turn the sparse result into a
# dense array so it can be handed to FAISS.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(questions)
question_vectors = tfidf_matrix.toarray()

# Build an L2 (Euclidean distance) index with one dimension per TF-IDF
# feature, and fill it with the question vectors cast to float32.
dimension = question_vectors.shape[-1]
index = faiss.IndexFlatL2(dimension)
float32_vectors = question_vectors.astype(np.float32)
index.add(float32_vectors)

# Persist the three artifacts: the fitted TF-IDF model, the FAISS index,
# and the raw questions/answers.
with open('tfidf_model.pkl', 'wb') as vectorizer_out:
    pickle.dump(vectorizer, vectorizer_out)

faiss.write_index(index, 'faiss_index.idx')

qa_payload = {'questions': questions, 'answers': answers}
with open('qa_data.pkl', 'wb') as qa_out:
    pickle.dump(qa_payload, qa_out)