In [16]:
# Import required libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

# Build the stopword set (common words that don't contribute much to meaning).
# The corpus is downloaded only when it is not already installed, so repeat
# runs do not need network access.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    # NLTK raises LookupError when the corpus cannot be found locally;
    # fetch it once and retry.
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

# Step 1: read the raw product CSV into `df` and show a short preview
input_path = r'C:\Users\15713\Desktop\Verizon\products.csv'  # Path to your file
preview_columns = ['product_name', 'product_description']

print("Step 1: Loading the dataset...\n")
df = pd.read_csv(input_path)
print("Dataset loaded successfully! Here's a preview of the first few rows:\n")
# Only the name and description columns are shown in the preview
print(df[preview_columns].head())

# Function to clean the text (removing HTML tags, special characters, stopwords, etc.)
def clean_text(text):
    """Normalize one description into lowercase, space-separated keywords.

    Processing order: lowercase, strip HTML tags, replace every character
    that is not a-z or whitespace with a space, then drop any word found in
    the module-level ``stop_words`` set.

    Args:
        text: The raw description. A missing value (``None``/``NaN``) yields
            an empty string; any other non-string value is converted with
            ``str()`` before cleaning.

    Returns:
        The cleaned text, with the remaining words joined by single spaces.
    """
    # 1. Handle missing data: a null cell becomes an empty string
    if pd.isnull(text):
        return ""

    # 2. Lowercase the text. Non-string cells (e.g. a description that was
    #    parsed as a number) have no .lower(), so coerce to str first.
    text = str(text).lower()

    # 3. Remove HTML tags. Substitute a space rather than '' so that
    #    "fast<br>secure" does not collapse into "fastsecure"; DOTALL lets a
    #    tag that contains a line break match as well.
    text = re.sub(r'<.*?>', ' ', text, flags=re.DOTALL)

    # 4. Drop special characters and numbers, again replacing with a space:
    #    "cloud-based" splits into two words instead of fusing, and a
    #    contraction such as "don't" becomes "don t", whose pieces are
    #    entries in NLTK's English stopword list (the fused "dont" is not).
    text = re.sub(r'[^a-z\s]', ' ', text)

    # 5. Remove stopwords; split() also collapses the extra spaces added above
    return ' '.join(word for word in text.split() if word not in stop_words)

# Step 2: run every product description through clean_text
print("\nStep 2: Cleaning the product descriptions...\n")
raw_descriptions = df['product_description']
df['cleaned_text'] = raw_descriptions.apply(clean_text)

# Step 3: keep only the product name and its cleaned description
export_columns = ['product_name', 'cleaned_text']
cleaned_df = df[export_columns]

# Step 4: write the result to a new CSV under human-readable column titles
output_path = r'C:\Users\15713\Desktop\Verizon\cleaned_data.csv'
export_headers = ["Product Name", "Cleaned Description"]
cleaned_df.to_csv(output_path, header=export_headers, index=False)

print(f"Step 5: Cleaned data saved successfully to {output_path}.\n")

# Additional debugging: read the file back and show its first rows
print("Step 6: Verifying saved data...\n")
saved_df = pd.read_csv(output_path)
print("Preview of saved cleaned data:\n")
print(saved_df.head())


Step 1: Loading the dataset...

Dataset loaded successfully! Here's a preview of the first few rows:

                product_name  \
0         Cloud Secure Agent   
1  MicroStrategy BI Platform   
2         Signature Services   
3                     VizLib   
4      PowerCenter Connector   

                                                                                                                                                                                                                                                                                                                                                                                                                                  product_description  
0                                                                                                                                                                                                                                          The Informatica Cloud Secure A

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\15713\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
