In [33]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
from sklearn.feature_extraction.text import CountVectorizer
# Fetch the NLTK data packages this notebook relies on: WordNet for the
# lemmatizer, the stop-word lists, and the Punkt tokenizer models.
# 'punkt_tab' is requested as well because newer NLTK releases (3.9+) make
# word_tokenize load the tokenizer data from that package instead of 'punkt';
# without it word_tokenize raises LookupError on a fresh environment.
for _nltk_resource in ('wordnet', 'stopwords', 'punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)

# Read the input workbook into a DataFrame and preview its first five rows
file_path = '/content/drive/MyDrive/123456.xlsx'
df = pd.read_excel(io=file_path)
print(df.head(n=5))

# Every character that is not an ASCII letter is replaced by a space.
_NON_LETTERS = re.compile(r"[^a-zA-Z]")

# Lazily filled cache so the stop-word set and the lemmatizer are created
# once instead of once per document.
_PREPROCESSING_RESOURCES = {}


def _get_preprocessing_resources():
    """Return the shared ``(stop_words, lemmatizer)`` pair, creating it on first use."""
    if not _PREPROCESSING_RESOURCES:
        stop_words = set(stopwords.words('english'))
        # Extra words removed in addition to the English stop-word list.
        stop_words.update(['claim', 'non'])
        _PREPROCESSING_RESOURCES['stop_words'] = stop_words
        _PREPROCESSING_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return (
        _PREPROCESSING_RESOURCES['stop_words'],
        _PREPROCESSING_RESOURCES['lemmatizer'],
    )


def preprocessing(text):
    """Normalize one document into a space-separated string of lemmatized tokens.

    Steps: lower-case, replace every non-letter with a space, tokenize,
    drop stop words (English list plus 'claim' and 'non'), then lemmatize
    each remaining token as a verb.

    Args:
        text: The raw document. ``None`` and NaN (a missing spreadsheet cell)
            are treated as an empty document; any other non-string value is
            converted with ``str()``.

    Returns:
        The cleaned tokens joined by single spaces ('' for a missing value).
    """
    # A missing cell arrives as None or float NaN (NaN is the only float that
    # differs from itself); it must not turn into the literal word "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    cleaned = _NON_LETTERS.sub(" ", text.lower())
    stop_words, lemmatizer = _get_preprocessing_resources()
    kept_tokens = [
        token for token in word_tokenize(cleaned) if token not in stop_words
    ]
    return ' '.join(lemmatizer.lemmatize(token, pos='v') for token in kept_tokens)

# Run preprocessing() over every entry of the 'text_en' column
docs = df['text_en'].apply(preprocessing)

# Define the 7 categories and their keywords.
# Keywords may be written with spaces or hyphens; categorize_text() treats any
# run of non-alphanumeric characters between keyword tokens as a separator.
category_keywords = {
    'a': ['ai unit', 'artificial intelligence unit', 'neural network', 'deep learning'],
    'b': ['parallel processing unit', 'parallel unit', 'gpu', 'multiprocessor'],
    'c': ['acceleration unit', 'accelerator unit', 'hardware accelerator'],
    'd': ['computation unit', 'arithmetic unit', 'alu', 'fpu'],
    'e': ['non-volatile', 'nv memory', 'nonvolatile memory', 'nvram'],
    'f': ['low power', 'low-energy', 'power efficient', 'energy saving'],
    'g': ['utilization', 'application', 'usage', 'implementation']
}

# Splits a keyword into its letter tokens (anything else is a separator).
_KEYWORD_SPLIT_RE = re.compile(r"[^a-z]+")

# Compiled pattern per raw keyword string, filled on first use.
_KEYWORD_REGEX_CACHE = {}


def _keyword_regex(keyword):
    """Return a compiled whole-word pattern for *keyword*, or None if it contains no letters."""
    if keyword not in _KEYWORD_REGEX_CACHE:
        tokens = _KEYWORD_SPLIT_RE.sub(" ", keyword.lower()).split()
        if tokens:
            # Tokens may be separated by spaces, hyphens or other punctuation
            # in the text; an optional plural ending is tolerated on the last
            # token. The word boundaries stop short keywords such as 'alu'
            # from matching inside unrelated words such as 'value'.
            phrase = r"[\W_]+".join(re.escape(token) for token in tokens)
            compiled = re.compile(r"\b" + phrase + r"(?:s|es)?\b")
        else:
            # A keyword without letters would otherwise match every text.
            compiled = None
        _KEYWORD_REGEX_CACHE[keyword] = compiled
    return _KEYWORD_REGEX_CACHE[keyword]


# Function to categorize text based on keywords
def categorize_text(text):
    """Return the first category whose keywords occur in *text* as whole words.

    Categories are tried in the order they appear in ``category_keywords``;
    the first one with a matching keyword wins.

    Args:
        text: The document to label (matching is case-sensitive against the
            lower-cased keywords).

    Returns:
        The matching category key, or 'e' when no keyword matches.
    """
    # NOTE(review): matching happens on preprocessed text elsewhere in this
    # file, where 'non' is removed and tokens are verb-lemmatized; keywords
    # such as 'non-volatile' or 'deep learning' may therefore still need a
    # preprocessed spelling to match — verify against real preprocessed output.
    for category, keywords in category_keywords.items():
        for keyword in keywords:
            pattern = _keyword_regex(keyword)
            if pattern is not None and pattern.search(text):
                return category
    # NOTE(review): the fallback label is the same as category 'e', so
    # unmatched documents are indistinguishable from real 'e' matches —
    # confirm this is intended.
    return 'e'

# Label every preprocessed document with categorize_text() and store the
# result in a new 'category' column
df['category'] = docs.apply(categorize_text)

# Write the labelled dataframe to a new Excel file, leaving out the index
output_file_path = '/content/drive/MyDrive/categorized_patent0.xlsx'
df.to_excel(output_file_path, index=False)

# Bare expression so the notebook shows the output path as the cell result
output_file_path


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


   No.                                            text_en
0    1  1. An artificial intelligence system comprisin...
1   26  The application relates to a computing power e...
2   29  The invention provides an artificial intellige...
3   53  The disclosure provides a forward interpolatio...
4   55  The application discloses a remote model train...


'/content/drive/MyDrive/categorized_patent0.xlsx'