# Create Tags using TF-IDF

In [16]:
import mysql.connector
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
import string

# Fetch the NLTK data packages this cell relies on (tokenizer models,
# stop-word lists and WordNet).
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Open the database connection used both to read products and to write the
# generated tags back.  Every setting can be overridden through an
# environment variable; the fallbacks are the values originally hard-coded here.
# NOTE(review): the fallback password is still stored in source -- set
# ECOMMERCE_DB_PASSWORD, then drop the literal and rotate the credential.
import os

db_connection = mysql.connector.connect(
    host=os.environ.get("ECOMMERCE_DB_HOST", "localhost"),
    user=os.environ.get("ECOMMERCE_DB_USER", "root"),
    password=os.environ.get("ECOMMERCE_DB_PASSWORD", "vfgetew2234*Wew"),
    database=os.environ.get("ECOMMERCE_DB_NAME", "ecommerce"),
    port=os.environ.get("ECOMMERCE_DB_PORT", '3307'),
)

# Dictionary cursor: every fetched row is a dict keyed by column name/alias.
cursor = db_connection.cursor(dictionary=True)

# Load every product together with the name of its category.
product_query = (
    "SELECT Product.id, Product.name, Category.name as category, Product.description "
    "FROM Product "
    "INNER JOIN Category ON Category.id=Product.CategoryId"
)
cursor.execute(product_query)
products = cursor.fetchall()

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
# Individual punctuation characters, kept as a set so tokens can be tested
# character by character (a substring test against string.punctuation lets
# multi-character tokens such as "''" or "..." through).
punctuation_chars = set(string.punctuation)


def preprocess_text(text):
    """Normalize free text into a space-separated string of lemmas.

    The text is lower-cased and tokenized; tokens made up solely of
    punctuation characters and English stop words are dropped; the remaining
    tokens are lemmatized.

    Args:
        text: The raw text. ``None`` or an empty string yields ``""``.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    if not text:
        # e.g. a column value that came back as None
        return ""
    tokens = word_tokenize(text.lower())
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if not all(ch in punctuation_chars for ch in token)
        and token not in stop_words
    ]
    return " ".join(lemmas)

# Filled in further down: product id -> list of keyword tags.
tags_per_product = defaultdict(list)

# One document per product, in the same order as `products`: the cleaned
# name, category and description joined by single spaces.
text_fields = ('name', 'category', 'description')
all_texts = [
    " ".join(preprocess_text(product[field]) for field in text_fields)
    for product in products
]

# TF-IDF Vectorization: one matrix row per product document.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(all_texts).tocsr()

# Extract keywords for each product
feature_names = vectorizer.get_feature_names_out()
top_n = 5  # number of keywords stored per product
for row_idx, product in enumerate(products):
    # Read the row's stored entries straight from the CSR arrays instead of
    # indexing the sparse matrix once per term, which is very slow on a
    # large catalogue.
    start, end = tfidf_matrix.indptr[row_idx], tfidf_matrix.indptr[row_idx + 1]
    scored_terms = [
        (feature_names[col], score)
        for col, score in zip(tfidf_matrix.indices[start:end], tfidf_matrix.data[start:end])
        if score != 0
    ]
    # Stable sort, highest score first; equal scores keep their stored order.
    scored_terms.sort(key=lambda pair: pair[1], reverse=True)
    tags_per_product[product['id']] = [term for term, _ in scored_terms[:top_n]]

# Write each product's tags back as one comma-separated string.
try:
    for product_id, product_tags in tags_per_product.items():
        tags = ", ".join(product_tags)
        try:
            cursor.execute("UPDATE Product SET tags = %s WHERE id = %s", (tags, product_id))
            db_connection.commit()
        except mysql.connector.Error as err:
            # Best effort: keep going after a failed row, but report which
            # product failed and why instead of hiding the cause.  Only
            # database errors are caught, so Ctrl-C still stops the loop.
            print(f'invalid: product {product_id}: {err}')
finally:
    # Release the cursor and the connection even if the loop is interrupted.
    cursor.close()
    db_connection.close()


[nltk_data] Downloading package punkt to /home/tek/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/tek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/tek/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


ProgrammingError: 1064 (42000): You have an error in your SQL syntax; check the manual that corresponds to your MariaDB server version for the right syntax to use near '=307430' at line 1

## Create n-gram tags

In [1]:
import mysql.connector
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from collections import defaultdict
import string

# Fetch the NLTK data packages this cell relies on (tokenizer models,
# stop-word lists and WordNet).
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Open the database connection used both to read products and to write the
# generated tags back.  Every setting can be overridden through an
# environment variable; the fallbacks are the values originally hard-coded here.
# NOTE(review): the fallback password is still stored in source -- set
# ECOMMERCE_DB_PASSWORD, then drop the literal and rotate the credential.
import os

db_connection = mysql.connector.connect(
    host=os.environ.get("ECOMMERCE_DB_HOST", "localhost"),
    user=os.environ.get("ECOMMERCE_DB_USER", "root"),
    password=os.environ.get("ECOMMERCE_DB_PASSWORD", "vfgetew2234*Wew"),
    database=os.environ.get("ECOMMERCE_DB_NAME", "ecommerce"),
    port=os.environ.get("ECOMMERCE_DB_PORT", '3307'),
)

# Dictionary cursor: every fetched row is a dict keyed by column name/alias.
cursor = db_connection.cursor(dictionary=True)

# Load one window of products (with category names), skipping the first
# 307430 rows.  Without an ORDER BY the server may return rows in any order,
# so the OFFSET would not reliably skip the same rows from run to run.
# NOTE(review): assumes Product.id is unique (e.g. the primary key) -- confirm.
product_query = (
    "SELECT Product.id, Product.name, Category.name as category, Product.description "
    "FROM Product "
    "INNER JOIN Category ON Category.id=Product.CategoryId "
    "ORDER BY Product.id "
    "LIMIT 1000000 OFFSET 307430"
)
cursor.execute(product_query)
products = cursor.fetchall()

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
# Individual punctuation characters, kept as a set so tokens can be tested
# character by character (a substring test against string.punctuation lets
# multi-character tokens such as "''" or "..." through).
punctuation_chars = set(string.punctuation)


def preprocess_text(text):
    """Normalize free text into a space-separated string of lemmas.

    The text is lower-cased and tokenized; tokens made up solely of
    punctuation characters and English stop words are dropped; the remaining
    tokens are lemmatized.

    Args:
        text: The raw text. ``None`` or an empty string yields ``""``.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    if not text:
        # e.g. a column value that came back as None
        return ""
    tokens = word_tokenize(text.lower())
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if not all(ch in punctuation_chars for ch in token)
        and token not in stop_words
    ]
    return " ".join(lemmas)

# product id -> list of n-gram tags
tags_per_product = defaultdict(list)


# Extract top phrases from topics (defined once, not on every loop iteration)
def get_top_phrases(model, feature_names, n_top_phrases):
    """Return, for each topic of ``model``, its highest-weighted phrases.

    Args:
        model: A fitted model exposing ``components_`` (one weight row per
            topic).
        feature_names: The phrase belonging to each column of ``components_``.
        n_top_phrases: Maximum number of phrases to return per topic.

    Returns:
        One list per topic, with phrases ordered from highest to lowest weight.
    """
    return [
        [feature_names[i] for i in topic.argsort()[:-n_top_phrases - 1:-1]]
        for topic in model.components_
    ]


for product in products:
    # A column that came back as None is treated as empty text so the join
    # below cannot fail.
    raw_text = " ".join(product[field] or "" for field in ('name', 'category', 'description'))
    text = preprocess_text(raw_text)

    # Tokenize the text using n-grams (bi-grams and tri-grams)
    count_vectorizer = CountVectorizer(stop_words='english', ngram_range=(2, 3))
    try:
        count_data = count_vectorizer.fit_transform([text])
    except ValueError:
        # Raised when the vocabulary is empty, i.e. the text has too few
        # usable tokens to form a single bi-gram.  Skip the product (leaving
        # its stored tags untouched) instead of aborting the whole run.
        continue

    # Apply LDA with a single topic, fitted on this one product's counts
    lda_model = LatentDirichletAllocation(n_components=1, random_state=42)
    lda_model.fit(count_data)

    # Get feature names (phrases) from CountVectorizer's vocabulary
    feature_names = count_vectorizer.get_feature_names_out()

    # Get top phrases for the single topic
    top_phrases = get_top_phrases(lda_model, feature_names, n_top_phrases=5)
    product_tags = top_phrases[0]

    # Update tags_per_product dictionary
    tags_per_product[product['id']] = product_tags

# Update tags field for each product (stored as one comma-separated string)
try:
    for product_id, product_tags in tags_per_product.items():
        tags = ", ".join(product_tags)
        try:
            cursor.execute("UPDATE Product SET tags = %s WHERE id = %s", (tags, product_id))
            db_connection.commit()
        except mysql.connector.Error as err:
            # Best effort: keep going after a failed row, but report which
            # product failed and why instead of hiding the cause.  Only
            # database errors are caught, so Ctrl-C still stops the loop.
            print(f'invalid: product {product_id}: {err}')
finally:
    # Release the cursor and the connection even if the loop is interrupted.
    cursor.close()
    db_connection.close()


[nltk_data] Downloading package punkt to /home/tek/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/tek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/tek/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


invalid
invalid
invalid
invalid
invalid
