<a href="https://colab.research.google.com/github/summaiyamus/NLP-Tasks/blob/main/Search_Engine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import os
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download the NLTK resources used by the preprocessing cell (safe to re-run:
# packages that are already present are reported as up-to-date).
# 'punkt_tab' is requested in addition to 'punkt' because recent NLTK releases
# make word_tokenize load the tokenizer data from 'punkt_tab'; on older releases
# that do not know the package the download call just reports an error and
# returns False.
import nltk
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
def _get_text_tools():
    """Return the shared ``(stop_words, stemmer)`` pair, building it on first use."""
    tools = getattr(_get_text_tools, "_cached", None)
    if tools is None:
        # Built lazily so that defining this cell does not require the NLTK
        # corpora to be downloaded yet; only the first real call does.
        tools = (frozenset(stopwords.words('english')), PorterStemmer())
        _get_text_tools._cached = tools
    return tools


def preprocess_text(text):
    """Turn one value into a space-joined string of stemmed, non-stop-word tokens.

    Steps: stringify the value, strip every character that is not an ASCII
    letter or whitespace, tokenize with ``word_tokenize``, drop English stop
    words (compared case-insensitively) and Porter-stem what is left.

    Args:
        text: The value to clean. Any object is accepted; it is converted with
            ``str()``. Missing scalars (``None``, ``NaN``, ...) are treated as
            empty text instead of being turned into the words "None"/"nan".

    Returns:
        str: The stemmed tokens joined by single spaces, or ``''`` when the
        input is missing or contains no letters.
    """
    # Missing values would otherwise be stringified and leak bogus tokens
    # ("none", "nan") into the output.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ''

    # Remove special characters and numbers
    # NOTE(review): characters are deleted, not replaced by a space, so words
    # separated only by punctuation ("end.Start") are fused into one token.
    # Left as-is because existing processed outputs depend on it.
    text = re.sub(r'[^a-zA-Z\s]', '', str(text))
    if not text.strip():
        # Nothing left to tokenize; same result as the full pipeline would give.
        return ''

    # The stop-word set and the stemmer are identical for every call, so they
    # are created once and reused instead of being rebuilt per value.
    stop_words, porter = _get_text_tools()

    # Tokenization and remove stop words
    tokens = [word for word in word_tokenize(text) if word.lower() not in stop_words]

    # Stemming
    return ' '.join(porter.stem(word) for word in tokens)

In [12]:

def process_csv_files(folder_path, processed_prefix="processed_"):
    """Clean every CSV file in a folder and collect the processed text.

    For each ``*.csv`` file in ``folder_path`` the file is read, columns that
    are entirely NaN are dropped, rows that still contain a NaN are dropped,
    and ``preprocess_text`` is applied to every remaining cell. The result is
    printed (first rows), written next to the input as
    ``<processed_prefix><filename>`` and its cells are appended to the
    returned list.

    Args:
        folder_path: Directory containing the CSV files.
        processed_prefix: Prefix used for the output files. Files in the folder
            whose name already starts with this prefix are treated as output
            of an earlier run and are not processed again.

    Returns:
        list: The processed cell values of all files, file by file in sorted
        filename order and row by row within a file.

    Raises:
        OSError: If ``folder_path`` cannot be listed (raised by ``os.listdir``).
    """
    processed_texts = []

    # Sorted so the order of the returned texts is reproducible between runs
    # (os.listdir returns entries in arbitrary order).
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".csv"):
            continue

        # Outputs are written into the same folder; without this guard a second
        # run would re-process them and create processed_processed_* files.
        if processed_prefix and filename.startswith(processed_prefix):
            continue

        file_path = os.path.join(folder_path, filename)

        try:
            # Read the CSV file into a DataFrame
            data = pd.read_csv(file_path)
        except pd.errors.EmptyDataError:
            # A zero-byte file has nothing to process; skip it instead of
            # aborting the whole folder.
            print(f"Skipping empty CSV file {file_path}")
            continue
        except pd.errors.ParserError as e:
            print(f"Error parsing the CSV file {file_path}: {e}")
            continue

        # Drop columns with all NaN values first: if they were still present,
        # the row-wise dropna below would discard every single row.
        data = data.dropna(axis=1, how='all')

        # Drop rows with any NaN values
        data = data.dropna()

        # Apply text preprocessing to all columns
        for col in data.columns:
            data[col] = data[col].apply(preprocess_text)

        # Display the processed data
        print(data.head())

        # Append processed text to the list (row-major order)
        processed_texts.extend(data.to_numpy().ravel())

        # Save the processed data to a new CSV file
        processed_csv_path = os.path.join(folder_path, f"{processed_prefix}{filename}")
        data.to_csv(processed_csv_path, index=False)
        print(f"Processed data saved to {processed_csv_path}")

    return processed_texts



In [None]:
if __name__ == '__main__':
    csv_folder_path = "/content/drive/MyDrive/DawnDataSet"

    # The dataset lives on Google Drive. Mount the drive here when it is not
    # available yet, so this cell also works on a fresh kernel where the
    # separate mount cell has not been run.
    if not os.path.isdir("/content/drive/MyDrive"):
        from google.colab import drive
        drive.mount('/content/drive')

    processed_texts = process_csv_files(csv_folder_path)

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the dataset path used earlier in this notebook points into
# /content/drive/MyDrive, so this mount has to exist before that cell runs;
# consider moving this cell to the top of the notebook.
from google.colab import drive
drive.mount('/content/drive')