<a href="https://colab.research.google.com/github/vendo495/projects/blob/main/E_commerce.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Read the labelled review data from the CSV file under /content
train_data = pd.read_csv(filepath_or_buffer='/content/train_data.csv')

# Preview the top rows to check that the columns were parsed as expected
train_data.head()


Unnamed: 0,name,brand,categories,primaryCategories,reviews.date,reviews.text,reviews.title,sentiment
0,"All-New Fire HD 8 Tablet, 8"" HD Display, Wi-Fi...",Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...",Electronics,2016-12-26T00:00:00.000Z,Purchased on Black FridayPros - Great Price (e...,Powerful tablet,Positive
1,Amazon - Echo Plus w/ Built-In Hub - Silver,Amazon,"Amazon Echo,Smart Home,Networking,Home & Tools...","Electronics,Hardware",2018-01-17T00:00:00.000Z,I purchased two Amazon in Echo Plus and two do...,Amazon Echo Plus AWESOME,Positive
2,Amazon Echo Show Alexa-enabled Bluetooth Speak...,Amazon,"Amazon Echo,Virtual Assistant Speakers,Electro...","Electronics,Hardware",2017-12-20T00:00:00.000Z,Just an average Alexa option. Does show a few ...,Average,Neutral
3,"Fire HD 10 Tablet, 10.1 HD Display, Wi-Fi, 16 ...",Amazon,"eBook Readers,Fire Tablets,Electronics Feature...","Office Supplies,Electronics",2017-08-04T00:00:00.000Z,"very good product. Exactly what I wanted, and ...",Greattttttt,Positive
4,"Brand New Amazon Kindle Fire 16gb 7"" Ips Displ...",Amazon,"Computers/Tablets & Networking,Tablets & eBook...",Electronics,2017-01-23T00:00:00.000Z,This is the 3rd one I've purchased. I've bough...,Very durable!,Positive


In [None]:
# Pull the first review text found for each sentiment label
positive_review, negative_review, neutral_review = (
    train_data.loc[train_data['sentiment'] == label, 'reviews.text'].iloc[0]
    for label in ('Positive', 'Negative', 'Neutral')
)

# Display the three samples as a (positive, negative, neutral) tuple
positive_review, negative_review, neutral_review


('Purchased on Black FridayPros - Great Price (even off sale)Very powerful and fast with quad core processors Amazing soundWell builtCons -Amazon ads, Amazon need this to subsidize the tablet and will remove the adds if you pay them $15.Inability to access other apps except the ones from Amazon. There is a way which I was able to accomplish to add the Google Play storeNet this is a great tablet for the money',
 'was cheap, can not run chrome stuff, returned to store.',
 'Just an average Alexa option. Does show a few things on screen but still limited.')

In [None]:
# Count how many reviews carry each sentiment label
class_counts = train_data.loc[:, 'sentiment'].value_counts()

class_counts


Positive    3749
Neutral      158
Negative      93
Name: sentiment, dtype: int64

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Tf-Idf vectorizer capped at 5000 vocabulary terms, with English stop words removed
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# Learn the vocabulary / idf weights from reviews.text, then encode that same column
tfidf_vectorizer.fit(train_data['reviews.text'])
X_tfidf = tfidf_vectorizer.transform(train_data['reviews.text'])

# (number of reviews, number of Tf-Idf features)
X_tfidf.shape


(4000, 4633)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Fit a Multinomial Naive Bayes baseline on the Tf-Idf features
clf = MultinomialNB()
clf.fit(X_tfidf, train_data['sentiment'])

# Predict on the same rows the model was fitted on.
# NOTE(review): no held-out split is used here, so the report below describes
# training performance only.
train_predictions = clf.predict(X_tfidf)

# The sentiment labels are already strings, so the report names its rows from
# the data itself; a hard-coded target_names list could silently mislabel rows
# if the label set ever changed. zero_division=0 reports 0.0 (without a
# warning) for classes that are never predicted.
report = classification_report(train_data['sentiment'], train_predictions, zero_division=0)

# print() renders the table; echoing the bare string would show escaped "\n" characters
print(report)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


'              precision    recall  f1-score   support\n\n    Negative       0.00      0.00      0.00        93\n     Neutral       0.00      0.00      0.00       158\n    Positive       0.94      1.00      0.97      3749\n\n    accuracy                           0.94      4000\n   macro avg       0.31      0.33      0.32      4000\nweighted avg       0.88      0.94      0.91      4000\n'

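# Class imbalance

The baseline classifier never predicts the `Negative` or `Neutral` class (recall 0.00 for both), so the minority classes are oversampled below.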


In [None]:
from sklearn.utils import resample

# Split the training frame into one sub-frame per sentiment label
positive_data = train_data[train_data['sentiment'].eq('Positive')]
neutral_data = train_data[train_data['sentiment'].eq('Neutral')]
negative_data = train_data[train_data['sentiment'].eq('Negative')]

# Draw from each minority class with replacement until it matches the Positive class size
majority_size = len(positive_data)
neutral_oversampled = resample(neutral_data, n_samples=majority_size, replace=True, random_state=42)
negative_oversampled = resample(negative_data, n_samples=majority_size, replace=True, random_state=42)

# Stack the untouched Positive rows on top of the two oversampled minority frames
oversampled_data = pd.concat([positive_data, neutral_oversampled, negative_oversampled])

# Encode the oversampled reviews with the already-fitted vectorizer (no refit)
y_oversampled = oversampled_data['sentiment']
X_oversampled_tfidf = tfidf_vectorizer.transform(oversampled_data['reviews.text'])

# Confirm that every class now has the same number of rows
y_oversampled.value_counts()


Positive    3749
Neutral     3749
Negative    3749
Name: sentiment, dtype: int64

In [None]:
# Refit the Naive Bayes classifier on the class-balanced (oversampled) features.
# This overwrites the state of the previously fitted `clf`.
clf.fit(X_oversampled_tfidf, y_oversampled)

# Predict on the same oversampled rows the model was just fitted on.
# NOTE(review): these are training scores; minority rows are also duplicated
# by the oversampling, so they say little about unseen reviews.
oversampled_predictions = clf.predict(X_oversampled_tfidf)

# Labels are strings, so the report names its rows from the data itself
# instead of relying on a hard-coded target_names order.
oversampled_report = classification_report(y_oversampled, oversampled_predictions)

# print() renders the table; echoing the bare string would show escaped "\n" characters
print(oversampled_report)


'              precision    recall  f1-score   support\n\n    Negative       0.98      1.00      0.99      3749\n     Neutral       0.95      0.99      0.97      3749\n    Positive       0.99      0.92      0.95      3749\n\n    accuracy                           0.97     11247\n   macro avg       0.97      0.97      0.97     11247\nweighted avg       0.97      0.97      0.97     11247\n'

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree Random Forest (fixed seed) on the oversampled Tf-Idf features
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_oversampled_tfidf, y_oversampled)

# Predict on the same oversampled rows used for fitting.
# NOTE(review): these are training scores, so a perfect report here does not
# demonstrate generalisation.
rf_predictions = rf_clf.predict(X_oversampled_tfidf)

# Labels are strings, so the report names its rows from the data itself
# instead of relying on a hard-coded target_names order.
rf_report = classification_report(y_oversampled, rf_predictions)

# print() renders the table; echoing the bare string would show escaped "\n" characters
print(rf_report)


'              precision    recall  f1-score   support\n\n    Negative       1.00      1.00      1.00      3749\n     Neutral       1.00      1.00      1.00      3749\n    Positive       1.00      1.00      1.00      3749\n\n    accuracy                           1.00     11247\n   macro avg       1.00      1.00      1.00     11247\nweighted avg       1.00      1.00      1.00     11247\n'

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import train_test_split

# NOTE(review): this cell originally used X_reduced_oversampled_tfidf and
# y_reduced_oversampled without any cell defining them, so it raised a
# NameError on a fresh kernel. The subset is rebuilt here as a reconstruction:
# a seeded random half of the oversampled data (the recorded report shows a
# total support of 5624, roughly half of the 11247 oversampled rows).
# Confirm against the author's original subset if it can be recovered.
_X_rest, X_reduced_oversampled_tfidf, _y_rest, y_reduced_oversampled = train_test_split(
    X_oversampled_tfidf, y_oversampled, test_size=0.5, random_state=42)

# Fit a 100-stage Gradient Boosting classifier (fixed seed) on the subset
gb_clf = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_clf.fit(X_reduced_oversampled_tfidf, y_reduced_oversampled)

# Predict on the same subset used for fitting (training scores only)
gb_predictions = gb_clf.predict(X_reduced_oversampled_tfidf)

# Labels are strings, so the report names its rows from the data itself
# instead of relying on a hard-coded target_names order.
gb_report = classification_report(y_reduced_oversampled, gb_predictions)

# print() renders the table; echoing the bare string would show escaped "\n" characters
print(gb_report)


'              precision    recall  f1-score   support\n\n    Negative       0.98      1.00      0.99      1832\n     Neutral       0.91      0.97      0.94      1882\n    Positive       0.97      0.89      0.93      1910\n\n    accuracy                           0.95      5624\n   macro avg       0.95      0.95      0.95      5624\nweighted avg       0.95      0.95      0.95      5624\n'

In [None]:
from sklearn.svm import SVC

# Fit a linear-kernel SVM with a one-vs-rest decision function.
# NOTE(review): X_reduced_oversampled_tfidf / y_reduced_oversampled are not
# defined in this cell; they must already exist in the kernel namespace.
svm_clf = SVC(kernel='linear', decision_function_shape='ovr', random_state=42)
svm_clf.fit(X_reduced_oversampled_tfidf, y_reduced_oversampled)

# Predict on the same subset used for fitting (training scores only)
svm_predictions = svm_clf.predict(X_reduced_oversampled_tfidf)

# Labels are strings, so the report names its rows from the data itself
# instead of relying on a hard-coded target_names order.
svm_report = classification_report(y_reduced_oversampled, svm_predictions)

# print() renders the table; echoing the bare string would show escaped "\n" characters
print(svm_report)


'              precision    recall  f1-score   support\n\n    Negative       1.00      1.00      1.00      1832\n     Neutral       0.99      1.00      0.99      1882\n    Positive       1.00      0.99      0.99      1910\n\n    accuracy                           0.99      5624\n   macro avg       0.99      0.99      0.99      5624\nweighted avg       0.99      0.99      0.99      5624\n'

In [None]:
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.metrics import classification_report

In [None]:
# Partition the training frame by sentiment label (one frame per class)
sentiment_column = train_data['sentiment']
positive_data = train_data.loc[sentiment_column == 'Positive']
neutral_data = train_data.loc[sentiment_column == 'Neutral']
negative_data = train_data.loc[sentiment_column == 'Negative']


In [None]:
# Upsample Neutral and Negative (sampling with replacement, seed 42) to the Positive class size
oversample_options = dict(replace=True, n_samples=len(positive_data), random_state=42)
neutral_oversampled = resample(neutral_data, **oversample_options)
negative_oversampled = resample(negative_data, **oversample_options)


In [None]:
# Concatenate the Positive rows followed by the upsampled Neutral and Negative rows
balanced_frames = [positive_data, neutral_oversampled, negative_oversampled]
oversampled_data = pd.concat(balanced_frames)


In [None]:
# Fit a smaller Tf-Idf vocabulary (at most 2000 terms, English stop words removed)
# on the oversampled reviews and keep the matching sentiment labels
reduced_tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=2000)
y_reduced = oversampled_data['sentiment']
X_reduced_tfidf = reduced_tfidf_vectorizer.fit_transform(oversampled_data['reviews.text'])


In [None]:
# Hold out 20% of the oversampled rows (seed 42); the remaining 80% trains the base models.
# NOTE(review): oversampled_data is built before this split, so copies of the same
# minority review can land on both sides of it.
blend_split = train_test_split(X_reduced_tfidf, y_reduced, random_state=42, test_size=0.2)
X_train_blend, X_holdout_blend, y_train_blend, y_holdout_blend = blend_split


In [None]:
# Base model 1: 100-tree Random Forest (seed 42) fitted on the blend training split
rf_clf_blend = RandomForestClassifier(random_state=42, n_estimators=100)
rf_clf_blend.fit(X=X_train_blend, y=y_train_blend)


In [None]:
# Base model 2: Multinomial Naive Bayes fitted on the same blend training split
mnb_clf_blend = MultinomialNB()
mnb_clf_blend.fit(X=X_train_blend, y=y_train_blend)



In [None]:
# Class-probability predictions of both fitted base models for the holdout rows
rf_probabilities_blend, mnb_probabilities_blend = (
    base_model.predict_proba(X_holdout_blend)
    for base_model in (rf_clf_blend, mnb_clf_blend)
)


In [None]:
# Place the Random Forest and Naive Bayes probability columns side by side;
# each holdout row becomes one feature vector for the meta-model
stacked_probabilities = np.hstack((rf_probabilities_blend, mnb_probabilities_blend))


In [None]:
# Meta-model: Logistic Regression (up to 1000 iterations, seed 42) fitted on the
# base models' stacked holdout probabilities
meta_model = LogisticRegression(random_state=42, max_iter=1000)
meta_model.fit(X=stacked_probabilities, y=y_holdout_blend)


In [None]:
from sklearn.model_selection import cross_val_predict

# meta_model was fitted on these exact holdout rows, so calling
# meta_model.predict() on them would only report training performance.
# Score it out-of-fold instead: every row is predicted by a clone of the
# meta-model fitted on the other folds (the fitted meta_model is left untouched).
meta_predictions = cross_val_predict(meta_model, stacked_probabilities, y_holdout_blend, cv=5)

# Evaluate the blending ensemble's performance.
# Labels are strings, so the report names its rows from the data itself
# instead of relying on a hard-coded target_names order.
# NOTE(review): oversampled_data is built before the train/holdout split, so
# duplicated minority reviews can sit on both sides of it; these scores are
# likely still optimistic.
blending_report = classification_report(y_holdout_blend, meta_predictions)
print(blending_report)

              precision    recall  f1-score   support

    Negative       1.00      1.00      1.00       740
     Neutral       1.00      1.00      1.00       751
    Positive       1.00      1.00      1.00       759

    accuracy                           1.00      2250
   macro avg       1.00      1.00      1.00      2250
weighted avg       1.00      1.00      1.00      2250



In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# Tokenization / padding parameters:
#   top_words         - vocabulary size kept by the tokenizer (and the Embedding input size)
#   max_review_length - fixed sequence length every review is padded/truncated to
top_words = 5000  # fixed: the original "5000p" was a SyntaxError
max_review_length = 100

# Build the word index from the oversampled reviews (lower-cased, punctuation filtered),
# then turn each review into a list of word ids
tokenizer = Tokenizer(num_words=top_words, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True)
tokenizer.fit_on_texts(oversampled_data['reviews.text'])
X_sequences = tokenizer.texts_to_sequences(oversampled_data['reviews.text'])

# Pad sequences to ensure consistent length
X_padded = pad_sequences(X_sequences, maxlen=max_review_length)

# Map each sentiment label to an integer class id, then one-hot encode it
sentiment_mapping = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
y_encoded = to_categorical(oversampled_data['sentiment'].map(sentiment_mapping))

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

import numpy as np

# Define the LSTM model: embedding -> LSTM(100) -> dropout -> 3-way softmax
embedding_length = 32  # size of each learned word vector

model_lstm = Sequential()
model_lstm.add(Embedding(top_words, embedding_length, input_length=max_review_length))
model_lstm.add(LSTM(100))
model_lstm.add(Dropout(0.5))
model_lstm.add(Dense(3, activation='softmax'))
model_lstm.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model_lstm.summary()

# Keras carves validation_split off the *last* rows of the arrays, before any
# shuffling. oversampled_data is concatenated class by class (Positive,
# Neutral, Negative), so without shuffling the final 20% would contain only
# oversampled Negative reviews. Shuffle the rows with a fixed seed first so
# the validation set contains all three classes.
# NOTE(review): oversampled duplicates can still appear in both the training
# and validation parts, so validation accuracy remains optimistic.
shuffled_rows = np.random.RandomState(42).permutation(len(X_padded))

# Train the model
model_lstm.fit(X_padded[shuffled_rows], y_encoded[shuffled_rows],
               epochs=10, batch_size=64, validation_split=0.2)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 32)           160000    
                                                                 
 lstm (LSTM)                 (None, 100)               53200     
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense (Dense)               (None, 3)                 303       
                                                                 
Total params: 213503 (834.00 KB)
Trainable params: 213503 (834.00 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7eaf5c1a2ce0>

In [None]:
from keras.layers import GRU

import numpy as np

# Define the GRU model: embedding -> GRU(100) -> dropout -> 3-way softmax
model_gru = Sequential()
model_gru.add(Embedding(top_words, embedding_length, input_length=max_review_length))
model_gru.add(GRU(100))
model_gru.add(Dropout(0.5))
model_gru.add(Dense(3, activation='softmax'))
model_gru.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model_gru.summary()

# Keras carves validation_split off the *last* rows of the arrays, before any
# shuffling. oversampled_data is concatenated class by class (Positive,
# Neutral, Negative), so without shuffling the final 20% would contain only
# oversampled Negative reviews. Shuffle the rows with a fixed seed first so
# the validation set contains all three classes.
# NOTE(review): oversampled duplicates can still appear in both the training
# and validation parts, so validation accuracy remains optimistic.
shuffled_rows = np.random.RandomState(42).permutation(len(X_padded))

# Train the model
model_gru.fit(X_padded[shuffled_rows], y_encoded[shuffled_rows],
              epochs=10, batch_size=64, validation_split=0.2)


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 32)           160000    
                                                                 
 gru (GRU)                   (None, 100)               40200     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 dense_1 (Dense)             (None, 3)                 303       
                                                                 
Total params: 200503 (783.21 KB)
Trainable params: 200503 (783.21 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7eaf4ba89090>

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# How many latent topics LDA should extract
n_topics = 5

# Fit LDA (seed 42) on the reduced Tf-Idf matrix and keep its transformed output
lda = LatentDirichletAllocation(random_state=42, n_components=n_topics)
lda_result = lda.fit_transform(X_reduced_tfidf)

# Collect the highest-weighted words of every topic in a fitted topic model
def display_topics(model, feature_names, no_top_words):
    """Return a dict mapping "Topic <i>" to that topic's top words.

    Args:
        model: fitted model exposing ``components_``; each entry is indexed
            per term and sorted with ``.argsort()``.
        feature_names: sequence mapping a term index to its word.
        no_top_words: how many of the highest-weighted words to keep per topic.

    Returns:
        dict keyed "Topic 0", "Topic 1", ... whose values list the words from
        highest to lowest weight.
    """
    def ranked_terms(weights):
        # argsort is ascending, so reverse it and keep the leading entries
        best_first = weights.argsort()[::-1][:no_top_words]
        return [feature_names[col] for col in best_first]

    return {
        "Topic %d" % (position): ranked_terms(weights)
        for position, weights in enumerate(model.components_)
    }

# Keep the ten strongest words of each LDA topic, looked up in the reduced vocabulary
no_top_words = 10
lda_topics = display_topics(
    model=lda,
    feature_names=reduced_tfidf_vectorizer.get_feature_names_out(),
    no_top_words=no_top_words,
)
lda_topics


In [None]:
from sklearn.cluster import KMeans

# Start with 5 clusters so the result lines up with the 5 LDA topics used earlier
n_clusters = 5

# Fit KMeans (seed 42) on the reduced Tf-Idf matrix and record each row's cluster id
kmeans = KMeans(random_state=42, n_clusters=n_clusters)
clusters = kmeans.fit_predict(X_reduced_tfidf)

# Function to list the strongest terms of each cluster
def get_top_keywords(data, clusters, labels, n_terms):
    """Return the ``n_terms`` highest-mean terms for every cluster.

    Args:
        data: matrix exposing ``.todense()``; rows are grouped by cluster and
            averaged column-wise.
        clusters: one cluster id per row of ``data``.
        labels: sequence mapping a column index to its term.
        n_terms: number of terms to keep per cluster.

    Returns:
        dict keyed "Cluster <id>" whose values list the terms in ascending
        order of mean score (strongest term last).
    """
    cluster_means = pd.DataFrame(data.todense()).groupby(clusters).mean()

    def strongest_terms(mean_scores):
        # ascending sort: the final n_terms positions hold the largest means
        ascending_order = np.argsort(mean_scores)
        return [labels[col] for col in ascending_order[-n_terms:]]

    return {
        "Cluster %d" % cluster_id: strongest_terms(mean_scores)
        for cluster_id, mean_scores in cluster_means.iterrows()
    }

# Ten strongest terms per KMeans cluster, looked up in the reduced vocabulary
cluster_keywords = get_top_keywords(
    data=X_reduced_tfidf,
    clusters=clusters,
    labels=reduced_tfidf_vectorizer.get_feature_names_out(),
    n_terms=10,
)
cluster_keywords




{'Cluster 0': ['useless',
  'shuts',
  'models',
  'speakers',
  'kindle',
  'week',
  'junk',
  'old',
  'year',
  'generation'],
 'Cluster 1': ['children',
  'returned',
  'hard',
  'like',
  'device',
  'good',
  'easy',
  'apps',
  'product',
  'use'],
 'Cluster 2': ['kid',
  'average',
  'just',
  'apple',
  'responsive',
  'information',
  'limited',
  'user',
  'program',
  'friendly'],
 'Cluster 3': ['like',
  'bought',
  'amazon',
  'buy',
  'just',
  'alexa',
  'screen',
  'love',
  'echo',
  'great'],
 'Cluster 4': ['disappointed',
  'curious',
  'kids',
  'apps',
  'price',
  'great',
  'games',
  'amazon',
  'good',
  'tablet']}

In [None]:
# Use twice as many topics (10) for a finer-grained look at the reviews
n_topics_detailed = 10

# Fit a second LDA model (seed 42, 15 iterations) on the reduced Tf-Idf matrix
lda_detailed = LatentDirichletAllocation(max_iter=15, random_state=42, n_components=n_topics_detailed)
lda_result_detailed = lda_detailed.fit_transform(X_reduced_tfidf)

# Keep the fifteen strongest words of each detailed topic
no_top_words_detailed = 15
lda_topics_detailed = display_topics(
    model=lda_detailed,
    feature_names=reduced_tfidf_vectorizer.get_feature_names_out(),
    no_top_words=no_top_words_detailed,
)
lda_topics_detailed


{'Topic 0': ['loves',
  'son',
  'bought',
  'alexa',
  'great',
  'gift',
  'tablet',
  'echo',
  'easy',
  'home',
  'love',
  'games',
  'product',
  'friendly',
  'just'],
 'Topic 1': ['junk',
  'week',
  'apps',
  'randomly',
  'closes',
  'use',
  'lots',
  'ads',
  'paper',
  'hard',
  'lasted',
  'used',
  'screen',
  'charge',
  'doesn'],
 'Topic 2': ['good',
  'reading',
  'tablet',
  'books',
  'dark',
  'download',
  'apps',
  'load',
  'use',
  'chrome',
  'run',
  'amazon',
  'store',
  'tries',
  'proper'],
 'Topic 3': ['generation',
  'product',
  'cause',
  '4th',
  'bad',
  '5th',
  'sound',
  'better',
  'like',
  'really',
  'buy',
  'kindle',
  'tablets',
  'apps',
  'great'],
 'Topic 4': ['echo',
  'great',
  'play',
  'love',
  'good',
  'plus',
  'better',
  'amazon',
  'like',
  'account',
  'movies',
  'device',
  'thing',
  'store',
  'use'],
 'Topic 5': ['christmas',
  'model',
  'card',
  'going',
  'kindle',
  'just',
  'sd',
  'fun',
  'models',
  'work',