Part 1: Text Processing (Lemmatization, Tokenization, Removing Stopwords & Punctuation, Generating Bigrams)

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

# Ensure necessary NLTK data is downloaded.
# 'punkt_tab' is the tokenizer data used by word_tokenize in current NLTK
# releases; 'punkt' is kept for older installations.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Initialize the WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Load the Excel file
# NOTE(review): file_path is not assigned anywhere in the code shown here —
# it must be set to the path of the input workbook before this line runs.
df = pd.read_excel(file_path)

# Specify the column to process
column_name = 'Column name'

# Cache for the English stop-word set so it is built once instead of once per
# processed row.
_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it on first use."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def process_text(text):
    """Normalize a piece of text into a space-separated string of lemmas.

    Steps, in order: lowercase, tokenize, drop tokens that are not purely
    alphanumeric (this removes punctuation tokens), drop English stop words,
    lemmatize each remaining token.

    Args:
        text: The input string.

    Returns:
        The surviving lemmatized tokens joined by single spaces; an empty
        string when no token survives.
    """
    stop_words = _english_stop_words()
    tokens = word_tokenize(text.lower())

    # Filter first, then lemmatize, so stop words are matched on the raw token.
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalnum() and token not in stop_words
    ]
    return ' '.join(lemmas)

# Apply the function to the specified column.
# Missing cells (NaN) are mapped to '' first; str(NaN) would otherwise yield
# the literal token "nan", which survives filtering and skews bigram counts.
df['Processed_Text'] = df[column_name].apply(
    lambda x: process_text('' if pd.isna(x) else str(x))
)

# Function to get top N bigrams using CountVectorizer


def bigrams(corpus, n=None):
    """Return the most frequent bigrams in ``corpus`` with their counts.

    Args:
        corpus: Iterable of text documents (strings) to analyse.
        n: Maximum number of bigrams to return; ``None`` returns all of them.

    Returns:
        A list of ``(bigram, count)`` tuples sorted by count, highest first.
    """
    vec = CountVectorizer(ngram_range=(2, 2))
    # Learn the vocabulary and build the document-term matrix in a single pass
    # instead of analysing the corpus twice with fit() then transform().
    bag_of_words = vec.fit_transform(corpus)
    # Column sums: total occurrences of each bigram across all documents.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]



# Get the top 200 bigrams
out = bigrams(df['Processed_Text'], n=200)

# Check and verify the output (displayed as the notebook cell result)
out

Part 2: Category Columns and Binary Assignment

In [None]:
import os  
# Define the keyword categories and their corresponding words.
# Each key is used as a column name in df; each value is the list of keywords
# looked up in Processed_Text for that column.
# NOTE(review): the 'x*' / 'a*' entries look like placeholders — replace them
# with the real category names and keywords.
categories = {
    'x1': ['a1','a2','a3','a4'],
    'x2': ['a1','a2','a3','a4'],
    'x3': ['a1','a2','a3','a4'],
    'x4': ['a1','a2','a3','a4'],
    'x5': ['a1','a2','a3','a4'],
    'x6':['a1','a2','a3','a4'],
    'x7': ['a1','a2','a3','a4'],
    'x8': ['a1','a2','a3','a4'],
    'x9': ['a1','a2','a3','a4'],
    'x10': ['a1','a2','a3','a4']
}

# Initialize the new columns with 0

Multi-Hot Encoding

In [None]:
# Create one column per category, initialised to 0.
# These zeros are placeholders: every category column is assigned again below
# from the keyword check on Processed_Text.
for category in categories:
    df[category] = 0

def update_category_columns(row, categories):
    """Return 1 if any keyword of any given category occurs in ``row``.

    Matching uses the ``in`` operator, so for a string ``row`` it is a plain
    substring test (the keyword 'art' matches the text 'party').

    Args:
        row: The text to search (one Processed_Text value).
        categories: Mapping of category name to a list of keywords.

    Returns:
        1 when at least one non-blank keyword is found in ``row``, else 0.
    """
    for keywords in categories.values():
        for word in keywords:
            # A blank keyword is a substring of (almost) any text and would
            # flag every row, so it is ignored rather than matched.
            if not word.strip():
                continue
            if word in row:
                return 1
    return 0

# Fill each category column with 1/0 depending on whether any of that
# category's keywords occurs in the row's Processed_Text.
for category, keywords in categories.items():
    df[category] = df['Processed_Text'].apply(
        update_category_columns, args=({category: keywords},)
    )

# Write the enriched DataFrame next to the input workbook (same directory as
# file_path), without the index column.
output_dir = os.path.dirname(file_path)
save_path = os.path.join(output_dir, 'processed_file_with_categories.xlsx')
df.to_excel(save_path, index=False)

print("Process completed")