![Innomatics Research Labs logo](https://www.innomatics.in/wp-content/uploads/2020/04/innomatics-research-labs-logo-squared.png)

<center> <h1 style='color:blue;'>Innomatics Research Labs January 2024 Internship </h1> </center>

# Name: Mohammad Wasiq

## Intern ID: IN1240273

## E-mail: gl0427@myamu.ac.in

In [None]:
# Mount Google Drive at /content/drive; the dataset loaded later in this notebook
# is read from that mount. This cell is Colab-specific.
from google.colab import drive
drive.mount('/content/drive')

In [105]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report, make_scorer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

In [106]:
# Load the review dataset from the mounted Drive folder (file is named
# cleaned_data.csv, so cleaning presumably happened in an earlier step — confirm).
# NOTE(review): absolute Colab path; the notebook will not run elsewhere without editing it.
df = pd.read_csv('/content/drive/MyDrive/Innomatics Research lab/Sentiment Analysis of Real-time Flipkart Product Reviews/cleaned_data.csv')

In [107]:
# Discard every row that has a missing value in any column.
df = df.dropna()

In [108]:
# Features are the cleaned review strings; the target is the sentiment label.
X, y = df['cleaned_review_text'], df['sentiment']

In [109]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle so the
# split is reproducible.
# NOTE(review): the split is not stratified although the classes are imbalanced
# (the classification report in this notebook shows 894 Negative vs 4798 Positive
# test rows) — consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [110]:
# TF-IDF vectorizer with library-default settings (not fitted yet).
tfidf_vectorizer = TfidfVectorizer()

In [111]:
# Eyeball the training texts.
# NOTE(review): many reviews end in a fused "read" token (e.g. "goodread",
# "productread") — presumably residue of a "READ MORE" link left by scraping;
# confirm and strip it in the cleaning step.
print(X_train)

8284                      perfect intermediate playersread
22877                                 superb fresh article
5533                                      nice shulterread
2517                                              goodread
14978                                     good productread
                               ...                        
21871                                                 best
5395     multiple small small hole cork doesnt look lik...
865                                       nice shuttleread
16091    great test great quality great price point tim...
23952                                                 nice
Name: cleaned_review_text, Length: 22766, dtype: object


In [112]:
# Learn the vocabulary and IDF weights from the training texts only, so the
# held-out split does not influence the features.
tfidf_vectorizer.fit(X_train)

In [113]:
# Project both splits into the TF-IDF space of the fitted vectorizer.
# Note: X_val_tfidf holds the held-out X_test split, despite the "val" in its name.
X_train_tfidf, X_val_tfidf = (
    tfidf_vectorizer.transform(texts) for texts in (X_train, X_test)
)

In [114]:
# Baseline random forest: 100 trees, fixed seed so results are reproducible.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

In [115]:
# Train the baseline forest on the TF-IDF training matrix.
rf_model.fit(X_train_tfidf, y_train)

In [116]:
# Predict sentiment labels for the held-out split with the baseline forest.
rf_pred = rf_model.predict(X_val_tfidf)

In [117]:
# Score the baseline predictions against the held-out labels.
accuracy = accuracy_score(y_test, rf_pred)
# F1 is reported for the 'Positive' class only.
f1 = f1_score(y_test, rf_pred, pos_label='Positive')

print("Accuracy:", accuracy)
print("F1-Score:", f1)

# Per-class precision / recall / F1 table
print(classification_report(y_test, rf_pred))

Accuracy: 0.890548137737175
F1-Score: 0.9376313945339875
              precision    recall  f1-score   support

    Negative       0.77      0.43      0.55       894
    Positive       0.90      0.98      0.94      4798

    accuracy                           0.89      5692
   macro avg       0.84      0.70      0.75      5692
weighted avg       0.88      0.89      0.88      5692



In [118]:
# Hyper-parameter grid for the random forest: 3 x 3 x 3 = 27 candidate settings.
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
)


In [119]:
# Scorer that computes F1 with 'Positive' as the positive class.
f1_scorer = make_scorer(f1_score, pos_label='Positive')

In [120]:
# 5-fold cross-validated grid search over param_grid, scored with f1_scorer.
# n_jobs=-1 spreads the candidate fits over all available cores; the selected
# model is unchanged because rf_model carries a fixed random_state.
# NOTE(review): the TF-IDF matrix was fitted on the whole training split before
# cross-validation, so each validation fold has already influenced the
# vocabulary/IDF weights — wrap the vectorizer and the forest in a Pipeline if
# unbiased CV scores are needed.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=5,
    n_jobs=-1,
)

In [None]:
# Run the search on the TF-IDF training matrix. This is the expensive cell of the
# notebook: every grid combination is fitted once per CV fold.
grid_search.fit(X_train_tfidf, y_train)

In [None]:
# Report the winning setting and its score; the score is measured with f1_scorer,
# i.e. F1 for the 'Positive' class.
print("Best Hyperparameters:", grid_search.best_params_)
print("Best F1-Score:", grid_search.best_score_)

In [None]:
# Estimator selected by the grid search.
best_rf_model = grid_search.best_estimator_

In [None]:
# Fit the selected model on the full training matrix.
# NOTE(review): likely redundant — GridSearchCV's default refit=True already
# refits best_estimator_ on the data passed to grid_search.fit(); confirm and
# drop this cell.
best_rf_model.fit(X_train_tfidf, y_train)

In [None]:
# Evaluate the tuned model on the held-out split. The names say "val", but
# X_val_tfidf is the transformed X_test and the labels are y_test.
y_pred_val = best_rf_model.predict(X_val_tfidf)
f1_val = f1_score(y_test, y_pred_val, pos_label='Positive')

In [None]:

# Show the tuned model's F1 on the held-out split.
print("F1 score on validation set:", f1_val)


In [None]:
# Importance scores exposed by the tuned forest (one value per input feature).
feature_importance = best_rf_model.feature_importances_

In [None]:
# Vocabulary terms of the fitted vectorizer, used as labels for the importances.
feature_names = tfidf_vectorizer.get_feature_names_out()

In [None]:
# Pair every term with its importance score in a two-column frame.
feature_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})

In [None]:
# Order the terms from most to least important.
feature_df = feature_df.sort_values(by='Importance', ascending=False)

In [None]:
# Number of top features the importance chart is meant to show (used in its title).
N = 10

In [None]:
# Keep the N highest-ranked features. A bare head() returns only the first 5
# rows, which contradicted the "Top N" chart title built from N.
top_features = feature_df.head(N)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Horizontal bar chart: one bar per selected feature, bar length = importance.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=top_features, x='Importance', y='Feature', ax=ax)
ax.set_title(f'Top {N} Important Features')
ax.set_xlabel('Importance Score')
ax.set_ylabel('Feature')
plt.show()

In [None]:
# Word cloud of the vocabulary used in positive reviews.
from wordcloud import WordCloud

# Concatenate every positive review into one string for the generator.
is_positive = df['sentiment'] == 'Positive'
positive_reviews_text = ' '.join(df.loc[is_positive, 'cleaned_review_text'])

# random_state pins the otherwise random word layout so the figure is
# reproducible across runs (42 matches the seed used elsewhere in this notebook).
wordcloud_positive = WordCloud(
    width=800,
    height=400,
    background_color='white',
    random_state=42,
).generate(positive_reviews_text)

# Display the WordCloud for positive reviews
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud_positive, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud for Positive Reviews')
plt.show()


In [None]:
# Confusion matrix of the tuned model on the held-out split.
from sklearn.metrics import confusion_matrix

# Fix the class order explicitly so the matrix rows/columns and the tick labels
# are guaranteed to agree; without tick labels the heatmap only showed 0 and 1.
class_labels = best_rf_model.classes_
cm = confusion_matrix(y_test, y_pred_val, labels=class_labels)

# Rows are the true classes, columns the predicted classes.
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    cmap='Blues',
    fmt='g',
    cbar=False,
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()


In [None]:
import joblib
from pathlib import Path

# Target folder on the mounted Drive. Create it up front so joblib.dump cannot
# fail with FileNotFoundError after the long grid search has already run.
MODEL_DIR = Path('/content/drive/MyDrive/irl/webs')
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Save the trained model
joblib.dump(best_rf_model, str(MODEL_DIR / 'best_rf_model.joblib'))

# Save the TF-IDF vectorizer (needed together with the model at inference time,
# because the model was trained on this vectorizer's feature columns)
joblib.dump(tfidf_vectorizer, str(MODEL_DIR / 'tfidf_vectorizer.joblib'))
