In [7]:
import os
import io
import pickle
import pandas as pd
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from flask import Flask, request
from flask_sqlalchemy import SQLAlchemy



cresentials = 'drive_creds.json'
API_NAME = 'drive'
API_VERSION = 'v3'
SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
app = Flask(__name__)
app.config['SQLALCHEMY_DATABASE_URI'] = 'your_database_uri'
db = SQLAlchemy(app)


class Document(db.Model):
    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.String(100), nullable=False)
    content = db.Column(db.Text, nullable=False)


STOPWORDS = set(stopwords.words('english'))


class GoogleDriveSearch:
    def __init__(self):
        self.service = self._authenticate()

    def _authenticate(self):
        from google.oauth2 import service_account

        creds = service_account.Credentials.from_service_account_file(cresentials, scopes=SCOPES)
        return build(API_NAME, API_VERSION, credentials=creds)

    def _download_document(self, file_id):
        request = self.service.files().get_media(fileId=file_id)
        fh = io.BytesIO()
        downloader = MediaIoBaseDownload(fh, request)
        done = False

        while not done:
            status, done = downloader.next_chunk()

        fh.seek(0)
        return fh

    def download_documents(self, folder_id, output_path):
        results = self.service.files().list(q=f"'{folder_id}' in parents and mimeType!='application/vnd.google-apps.folder'",
                                            fields="files(id, name)").execute()
        files = results.get('files', [])

        if not files:
            print('No files found.')
            return

        os.makedirs(output_path, exist_ok=True)

        for file in files:
            print(f"Downloading {file['name']}...")
            file_id = file['id']
            fh = self._download_document(file_id)

            with open(os.path.join(output_path, file['name']), 'wb') as f:
                f.write(fh.read())

            print(f"Downloaded {file['name']} to {output_path}.")

    def _preprocess_text(self, text):
        sentences = sent_tokenize(text)
        preprocessed_sentences = []

        for sentence in sentences:
            words = word_tokenize(sentence.lower())
            words = [word for word in words if word.isalpha() and word not in STOPWORDS]
            preprocessed_sentences.append(' '.join(words))

        return ' '.join(preprocessed_sentences)

    def _load_documents(self, folder_path):
        documents = []

        for file in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file)

            if os.path.isfile(file_path):
                with open(file_path, 'r', encoding='utf-8') as f:
                    text = f.read()
                    preprocessed_text = self._preprocess_text(text)
                    documents.append(preprocessed_text)

        return documents

    def build_index(self, folder_id, output_path):
        self.download_documents(folder_id, output_path)
        documents = self._load_documents(output_path)

        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(documents)

        with open('tfidf.pickle', 'wb') as f:
            pickle.dump((vectorizer, tfidf_matrix), f)

        print("Index built successfully.")

    def load_index(self):
        with open('tfidf.pickle', 'rb') as f:
            vectorizer, tfidf_matrix = pickle.load(f)

        return vectorizer, tfidf_matrix


def main():
    drive_search = GoogleDriveSearch()
    folder_id = '1X1Wd1D2mSnCP-llYQghglTrMDcnAIh1a'
    output_path = 'output'
    drive_search.build_index(folder_id, output_path)
    vectorizer, tfidf_matrix = drive_search.load_index()
    queries = [
        "How To Create an Amazon AWS Free Tier Account?",
        "what are components of s3?",
        "what is S3 replication?",
    ]
    for file in os.listdir(output_path):
        file_path = os.path.join(output_path, file)

        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            document = Document(name=file, content=content)
            db.session.add(document)

        db.session.commit()
    for query in queries:
        print(f"\nQuery: {query}")
        preprocessed_query = drive_search._preprocess_text(query)
        query_vector = vectorizer.transform([preprocessed_query])
        similarities = cosine_similarity(query_vector, tfidf_matrix)
        most_similar_index = similarities.argmax()
        document_name = os.listdir(output_path)[most_similar_index]
        print(f"Most relevant document: {document_name}")


if __name__ == '__main__':
    main()


No files found.


ValueError: empty vocabulary; perhaps the documents only contain stop words

In [2]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [5]:
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to C:\Users\win
[nltk_data]     10\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True