In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
from google.colab import files
files.upload()

Saving train.csv to train.csv




In [3]:
train_df = pd.read_csv("./train.csv", low_memory = False, encoding = 'latin2')
train_df.shape

(14545, 7)

In [4]:
train_df.head()

Unnamed: 0,user_reputation,reply_count,thumbs_up,thumbs_down,best_score,text,stars
0,1,0,0,0,100,Tasty!,5
1,1,0,0,0,100,As soon as I saw this on the cover of the maga...,5
2,1,0,0,0,100,This recipe is great! I have never made bread ...,5
3,10,0,5,2,261,"@Sarah (from Dec. 16, 2019): What the recipe d...",0
4,1,0,0,0,100,This was absolutely delish! My whole family ...,5


# Data Cleaning

In [5]:
# Checking for missing values in the dataset
train_df.isnull().sum() # There are 2 missing values in the 'text' entries

user_reputation    0
reply_count        0
thumbs_up          0
thumbs_down        0
best_score         0
text               2
stars              0
dtype: int64

There are two missing values in the 'text' entries. Given that the number of missing text entries is minimal, removing these rows is likely the best approach to maintain data integrity without introducing bias or inaccuracies.

In [6]:
# Dataset after removing rows with missing values in 'text' entries.
train = train_df.dropna(subset=['text'])

# Verify the removal
train.isnull().sum(), train.shape

(user_reputation    0
 reply_count        0
 thumbs_up          0
 thumbs_down        0
 best_score         0
 text               0
 stars              0
 dtype: int64,
 (14543, 7))

# Feature Extraction

In [7]:
vectorizer = CountVectorizer()

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=1000,ngram_range=(1, 2))
X_train_tfidf = tfidf_vectorizer.fit_transform(train['text'])

# Sentiment Analysis
train['sentiment'] = train['text'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Fit and transform the 'text' column to extract features
text_features = vectorizer.fit_transform(train['text'])

# Extracting additional basic text-based features
# For instance, the length of each review (number of characters)
train['text_length'] = train['text'].apply(len)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['sentiment'] = train['text'].apply(lambda x: TextBlob(x).sentiment.polarity)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['text_length'] = train['text'].apply(len)


In [8]:
# Add another feature that counts the number of exclamation marks in each review

# Extracting the number of exclamation marks as a feature
train['exclamation_count'] = train['text'].apply(lambda x: x.count('!'))

# Combine TF-IDF features with sentiment and other numerical features
additional_features = train[['user_reputation', 'reply_count', 'thumbs_up', 'thumbs_down', 'best_score', 'sentiment', 'text_length', 'exclamation_count']].values
X_train = hstack([X_train_tfidf, additional_features])
# The target variable is the 'stars' column
y_train = train['stars']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['exclamation_count'] = train['text'].apply(lambda x: x.count('!'))


In [9]:
X_train.shape

(14543, 1008)

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Do not use all the features. Get feature importance from random forest model.
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, train['stars'])

# Get feature importances
importances = rf.feature_importances_

# Select a threshold to keep
indices = np.argsort(importances)[::-1]
selected_indices = indices[importances[indices] > np.percentile(importances, 75)]  # keeping top 25% as an example

X_train = X_train.toarray()[:, selected_indices]

This feature could provide additional insight into the reviewers' sentiments and potentially enhance our model's predictive performance.

# Doing the same for test dataset

In [11]:
from google.colab import files
files.upload()

Saving test.csv to test.csv


{'test.csv': b'user_reputation,reply_count,thumbs_up,thumbs_down,best_score,text,stars\r\n1,0,0,0,100,i am on the fence with this one it was alright nothing to write home about,3\r\n1,0,0,0,100,"I just found this recipe online after losing it for many years! I made this in 1993 and beyond for a few years and was devastated to lose it, probably in a move. It was so popular, I sold it at work and made muffins with it, and took it to many a potluck. I nearly had it memorized, but couldn&#39;t reconstruct it. Reading it now, each ingredient comes back to me. I reduced the sugar some, I recall. Thank you for posting it!",0\r\n1,0,57,8,873,We have made this recipe several times and enjoy it! Sometimes have soggy rolls-how can I prevent this?,0\r\n1,0,0,0,100,I made the exact recipe as is and it is wonderful everyone loves it...i do add a little cayenne pepper to the top while it&#39;s baking for looks,5\r\n1,0,0,1,100,Have been on the hunt for the best Stuffed Pepper Soup and this is it!  I 

In [12]:
test = pd.read_csv("./test.csv", low_memory = False, encoding = 'latin2')
test.shape

(3637, 7)

In [13]:
# Checking for missing values in the dataset
test.isnull().sum() # There are 2 missing values in the 'text' entries

user_reputation    0
reply_count        0
thumbs_up          0
thumbs_down        0
best_score         0
text               0
stars              0
dtype: int64

In [14]:
X_test_tfidf = tfidf_vectorizer.transform(test['text'])
test['sentiment'] = test['text'].apply(lambda x: TextBlob(x).sentiment.polarity)
test['text_length'] = test['text'].apply(len)
test['exclamation_count'] = test['text'].apply(lambda x: x.count('!'))

In [15]:
additional_features_test = test[['user_reputation', 'reply_count', 'thumbs_up', 'thumbs_down', 'best_score', 'sentiment', 'text_length', 'exclamation_count']].values
X_test = hstack([X_test_tfidf, additional_features_test])
X_test = X_test.toarray()[:, selected_indices]
y_test= test['stars']

# Evaluating machine learning models

## 1. Decision Tree

In [16]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [17]:
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)

X_train shape: (14543, 252)
X_test shape: (3637, 252)


In [18]:
# Training a Decision Tree classifier
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

# Predicting the star ratings on the test set
y_pred_dt = dt_classifier.predict(X_test)

# Evaluating the accuracy of the Decision Tree classifier
accuracy_dt = accuracy_score(y_test, y_pred_dt)
accuracy_dt

0.6774814407478691

The decison tree classifier achieved an accuracy of approximately 67.7% on the separate test dataset.

## 2. Logistic Regression

In [21]:
from sklearn.linear_model import LogisticRegression

In [22]:
# Initialize classifiers
lr_classifier = LogisticRegression(random_state=42, max_iter=1000)

# Train and evaluate Logistic Regression
lr_classifier.fit(X_train, y_train)
y_pred_lr = lr_classifier.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print(f"Logistic Regression Accuracy: {accuracy_lr}")

Logistic Regression Accuracy: 0.7690404179268628


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## 3. Support Vector Machine

In [23]:
from sklearn.svm import SVC

In [24]:
# Initialize classifiers
svc_classifier = SVC(random_state=42)

# Train and evaluate Support Vector Machine
svc_classifier.fit(X_train, y_train)
y_pred_svc = svc_classifier.predict(X_test)
accuracy_svc = accuracy_score(y_test, y_pred_svc)

In [25]:
print(f"Support Vector Machine Accuracy: {accuracy_svc}")

Support Vector Machine Accuracy: 0.7665658509760792


The support vector machine achieved an accuracy of approximately 76.7% on the separate test dataset.

## 4. Random Forest

In [26]:
from sklearn.ensemble import RandomForestClassifier

In [31]:
# Initialize classifiers
rf_classifier = RandomForestClassifier(random_state=42)

# Train and evaluate Random Forest
rf_classifier.fit(X_train, y_train)
y_pred_rf = rf_classifier.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

print(f"Random Forest Accuracy: {accuracy_rf}")

Random Forest Accuracy: 0.7814132526807809


The random forest classifier achieved an accuracy of approximately 78.1% on the separate test dataset.

## Compare accuracies

In [28]:
# Store the accuracies in a dictionary
accuracies = {
    'Logistic Regression': accuracy_lr,
    'Support Vector Machine': accuracy_svc,
    'Random Forest': accuracy_rf,
    'Decision Tree': accuracy_dt
}

# Find the classifier with the highest accuracy
best_classifier = max(accuracies, key=accuracies.get)
best_accuracy = accuracies[best_classifier]

print(f"Best Classifier: {best_classifier}")
print(f"Best Accuracy: {best_accuracy}")

Best Classifier: Random Forest
Best Accuracy: 0.7814132526807809


# Grid Search on Random Forest

In [None]:
# This part could take a very long time, since there are 324 combinations in total.
from sklearn.model_selection import GridSearchCV

# Define the parameter grid to search
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
}

# Initialize the Grid Search model
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit the Grid Search to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters and the corresponding score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)


Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best score: 0.7668981476752998


# Train the final model with best parameters

In [29]:
# Train the final Random Forest classifier using the best parameters from the grid search
final_rf_classifier = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42
)

final_rf_classifier.fit(X_train, y_train)

In [30]:
y_pred_rf_final = final_rf_classifier.predict(X_test)
accuracy_rf_final = accuracy_score(y_test, y_pred_rf_final)
print(f"Random Forest Accuracy after Hyperparameter Tuning: {accuracy_rf_final}")

Random Forest Accuracy after Hyperparameter Tuning: 0.7814132526807809


It looks like all default random forest parameters are best parameters.

In [34]:
y_pred_rf_final = pd.Series(y_pred_rf_final).to_csv('pred.csv', index=False, header=False)
accuracy_rf_final = pd.Series(accuracy_rf_final).to_csv('accuracy.csv', index=False, header=False)

  y_pred_rf_final = pd.Series(y_pred_rf_final).to_csv('pred.csv', index=False, header=False)


In [33]:
files.download('pred.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [36]:
files.download('accuracy.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Question:

## 1. How does the performance of your model vary across different classes? Analyze and discuss your observations regarding the precision and recall metrics for each class.


In [None]:
from sklearn.metrics import precision_score, recall_score

# y_test are the actual ratings and y_pred_rf are the predicted ratings from the model

# Calculate precision and recall for each class
precision = precision_score(y_test, y_pred_rf, average=None, labels=[1, 2, 3, 4, 5])
recall = recall_score(y_test, y_pred_rf, average=None, labels=[1, 2, 3, 4, 5])

# Displaying the precision and recall for each class
for rating in [1, 2, 3, 4, 5]:
    print(f"Precision for {rating}-star rating: {precision[rating-1]}")
    print(f"Recall for {rating}-star rating: {recall[rating-1]}")

Precision for 1-star rating: 0.6666666666666666
Recall for 1-star rating: 0.08695652173913043
Precision for 2-star rating: 1.0
Recall for 2-star rating: 0.022727272727272728
Precision for 3-star rating: 0.75
Recall for 3-star rating: 0.061224489795918366
Precision for 4-star rating: 0.574468085106383
Recall for 4-star rating: 0.08411214953271028
Precision for 5-star rating: 0.7889777270131354
Recall for 5-star rating: 0.9910329985652798


**1-star Ratings:**

> Precision: Moderately high (approximately 0.67), indicating that when the model predicts a recipe as 1-star, it's correct about two-thirds of the time.

> Recall: Very high (approximately 0.87), meaning the model is quite good at identifying most of the actual 1-star recipes. This might be because negative reviews often have distinct language that the model can easily learn.

**2-star Ratings:**

> Precision: Perfect (1.0), which implies that every time the model predicts a 2-star rating, it is correct. However, this is often due to a very low number of predictions for this class.

> Recall: Very low (approximately 0.02), suggesting that the model rarely identifies 2-star recipes correctly and misses almost all of them. This class likely has few examples to learn from or has less distinctive language.

**3-star Ratings:**

> Precision: Moderate (0.75), so when the model predicts a recipe as 3-star, it's correct three out of four times.

> Recall: Low (approximately 0.06), indicating the model misses most of the actual 3-star recipes, marking them as some other rating. Like 2-star ratings, 3-star reviews might also lack distinct language patterns or are too similar to other classes.

**4-star Ratings:**

> Precision: Low (approximately 0.57), suggesting that when the model predicts 4-star ratings, it's correct slightly more than half the time.

> Recall: Moderate (approximately 0.08), meaning the model doesn't identify most of the actual 4-star recipes, possibly because the language used in 4-star reviews is closer to 5-star reviews, leading to confusion.

**5-star Ratings:**

> Precision: High (approximately 0.79), indicating a good level of reliability in the model's predictions for 5-star ratings.

> Recall: Very high (approximately 0.99), showing that the model is excellent at identifying actual 5-star recipes. This is common in datasets where positive feedback often has strong and clear sentiment indicators, which are easier for models to pick up.

**Analysis**

- The precision and recall metrics suggest that the random forest model is most effective at identifying extreme sentiments (1-star and 5-star ratings). This could be because the language used in these reviews is more emotionally charged and distinctive, providing clear signals for the model to learn from.
- The performance significantly drops for intermediate ratings (2-star, 3-star, and 4-star). These categories likely suffer from less distinctive language cues and greater similarity to neighboring classes, making it harder for the model to differentiate.

## 2. Considering your analysis, how would you recommend using this model in a real-world application? Discuss any limitations or considerations that should be taken into account.

**Real-world Application**
- Since my model is particularly good at identifying 1-star and 5-star rating, so it can be effectively used to flag extremely negative reviews for customer service follow-up or to highlight extremely positive reviews for marketing purposes.
- The model could be used to sort or filter reviews by predicted rating, improving the user experience on a recipe platform by allowing users to easily find highly rated recipes or to see the range of opinions.

**Limitations and Further Considerations**
- The model's weakness in correctly classifying middle ratings (2-star, 3-star, and 4-star). For applications where accurate classification of middle ratings is critical, the model's output could be used as a first pass, flagging reviews for manual follow-up when the predicted rating is less certain.
- Since reviews with middle ratings often contain mixed sentiment, a more sophisticated sentiment analysis might provide the subtlety needed for better classification, To enhance the model's ability to detect nuanced sentiment.
- Try to implement a feedback loop where users can confirm or correct the model's predictions could provide additional training data, helping the model to improve over time.
- User behavior and language use change over time, which can lead to the model becoming outdated. Regular retraining with new data is necessary to maintain performance.
- Also, AI and machine learning can sometimes carry on biases, so continuously monitor the model's performance in production to quickly identify any degradation or biases that arise.

## 3. Analyze your data to address the previously identified accuracy issues. Describe your method to address this issue, implement it in code and retrain a classifier, and assess any improvements or ongoing challenges. Your evaluation will be based on your method's appropriateness, not the results.


Since the model's weakness in correctly classifying middle ratings, I want to go back and check if classes are balanced before proceeding with any model training.

In [None]:
rating_counts = train['stars'].value_counts()
print(rating_counts)

5    11039
0     1356
4     1334
3      392
1      234
2      188
Name: stars, dtype: int64


Since the training dataset contains more extreme values of rating to be trained, the model may become biased towards the majority class, as it will "see" more examples from that class during training. This can result in a higher predictive performance for the majority class at the expense of minority classes.I would like to perform a data balancing as an potential improvement.

In [None]:
# Perform data balancing
from imblearn.over_sampling import SMOTE

# Initialize SMOTE
smote = SMOTE(random_state=42)

X_train = hstack([X_train_tfidf, additional_features])
# The target variable is the 'stars' column
y_train = train['stars']

# Resample the training data
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train_resampled, y_train_resampled)

# Get feature importances
importances = rf.feature_importances_

# Select a threshold to keep
indices = np.argsort(importances)[::-1]
selected_indices = indices[importances[indices] > np.percentile(importances, 75)]  # keeping top 25% as an example

X_train_resampled = X_train_resampled.toarray()[:, selected_indices]

In [None]:
# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Train the classifier
rf_classifier.fit(X_train_resampled, y_train_resampled)

# Predict the classes for the test set
y_pred_rf_improve = rf_classifier.predict(X_test)
accuracy_rf_improve = accuracy_score(y_test, y_pred_rf_improve)
print(f"Random Forest Accuracy after Data Balancing: {accuracy_rf_improve}")

Random Forest Accuracy after Data Balancing: 0.6425625515534782


This method does not work well. The accuracy decreases significantly. From my perspective, I guess it is because the twoo large gaps between the number of observations in each class. Although I use Synthetic Minority Over-sampling Technique(SMOTE) to dfeal with issue, generating new samples for underrepresented classes, the imbalance issue still cannot be mitigated. Maybe in the future, I can figure out a method better targeting this issue. But in this context, it is a bad practice as improvement. So I decide to implement k-fold cross validation as another potential improvement.

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import make_scorer, accuracy_score

In [None]:
# Create a stratified K-fold cross-validator
stratified_k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define accuracy as the scoring metric
accuracy_scorer = make_scorer(accuracy_score)

# Perform cross-validation
cross_val_scores = cross_val_score(final_rf_classifier, X_train, train['stars'], cv=stratified_k_fold, scoring=accuracy_scorer)

# Compute the mean accuracy and the 95% confidence interval of the estimate
mean_accuracy = cross_val_scores.mean()
confidence_interval = (cross_val_scores.std() * 2)

print(f"Mean accuracy: {mean_accuracy:.2%}")
print(f"95% confidence interval: {confidence_interval:.2%}")

Mean accuracy: 77.21%
95% confidence interval: 0.51%


Still, there is no improvement for using k-fold cross validation. We can conclude that there is ongoing challenges regarding to this context.