In [1]:
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


In [2]:
import pandas as pd

# Load the dataset
file_path = './drive/MyDrive/Colab Notebooks/CS410/data/merged.csv'

steam_data = pd.read_csv(file_path)

# Display the first few rows of the dataframe
# steam_data.head()
print(steam_data.shape)
print(steam_data.columns)

(1151433, 21)
Index(['Unnamed: 0', 'recommendationid', 'language', 'review',
       'timestamp_created', 'timestamp_updated', 'voted_up', 'votes_up',
       'votes_funny', 'weighted_vote_score', 'comment_count', 'steam_purchase',
       'received_for_free', 'written_during_early_access', 'author.steamid',
       'author.num_games_owned', 'author.num_reviews',
       'author.playtime_forever', 'author.playtime_last_two_weeks',
       'author.playtime_at_review', 'author.last_played'],
      dtype='object')


# Data Shuffle and Re-sampling

In [3]:
steam_shuffled = steam_data.sample(frac=1).reset_index(drop=True)

steam_shuffled = steam_shuffled.sample(frac=0.01).reset_index(drop=True)
print(steam_shuffled.shape)

(11514, 21)


# Label the data with sentiment

In [4]:
steam_filtered = steam_shuffled

# Assign labels to the reviews (1 for positive and 0 for negative)
steam_filtered['sentiment'] = (steam_data['voted_up'] == True).astype(int)

# Display the distribution of the sentiments and the first few rows of the new dataframe
sentiment_distribution = steam_filtered['sentiment'].value_counts()
steam_filtered[['review', 'voted_up', 'sentiment']].head(), sentiment_distribution

(                                              review  voted_up  sentiment
 0                              is good gamei like it      True          1
 1  It's definitely a game to play. It's one of th...      True          1
 2             online jank as shit, but it's aoe2 yo.      True          1
 3  This game is very immersive and allows many si...      True          1
 4  Everyone posts negative hate against this game...      True          1,
 1    6281
 0    5233
 Name: sentiment, dtype: int64)

# Preprocess the text and create training and testing set

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Download stopwords from NLTK
nltk.download('stopwords')

# Function to preprocess the text
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    # Stem the words
    stemmer = SnowballStemmer('english')
    text = ' '.join([stemmer.stem(word) for word in text.split()])
    return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
from sklearn.model_selection import train_test_split

# Apply the preprocessing function to the review texts
steam_filtered['review'] = steam_filtered['review'].apply(preprocess_text)

# Split the data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(
    steam_filtered['review'],
    steam_filtered['sentiment'],
    test_size=0.2,
    random_state=42
)

trained_models = dict()

In [7]:
steam_filtered[['review', 'voted_up', 'sentiment']].head(), sentiment_distribution

(                                              review  voted_up  sentiment
 0                                    good gamei like      True          1
 1  definit game play one best game excit detail c...      True          1
 2                             onlin jank shit aoe yo      True          1
 3  game immers allow mani simul game also allow p...      True          1
 4  everyon post negat hate game yet still alpha b...      True          1,
 1    6281
 0    5233
 Name: sentiment, dtype: int64)

# Calculate most common words

In [8]:
import pandas as pd
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

# Assuming you have a DataFrame 'steam_filtered' with 'review' and 'sentiment' columns
N_WORDS = 20
# Function to get top words for a specific sentiment
def get_top_words(df, sentiment_value=None):
    if sentiment_value is not None:
        df = df[df['sentiment'] == sentiment_value]
    reviews = ' '.join(df['review'].values)  # Merge all reviews into one string
    words_list = word_tokenize(reviews)  # Tokenize the merged string
    word_counts = Counter(words_list)
    total_words = sum(word_counts.values())
    top_words = word_counts.most_common(N_WORDS)

    top_words_with_percentage = []
    for word, count in top_words:
        frequency_percentage = (count / total_words) * 100
        top_words_with_percentage.append((word, count, frequency_percentage))

    return top_words_with_percentage

# Calculate top words with frequency percentage for negative (sentiment = 0), positive (sentiment = 1), and all reviews
top_negative_words = get_top_words(steam_filtered, 0)
top_positive_words = get_top_words(steam_filtered, 1)
top_all_words = get_top_words(steam_filtered)

print("\nTop words indicating Positive sentiment with frequency percentage:")
for word, count, percentage in top_positive_words:
    print(f"{word}: Count - {count}, Percentage - {percentage:.2f}%")

print("\nTop words indicating Negative sentiment with frequency percentage:")
for word, count, percentage in top_negative_words:
    print(f"{word}: Count - {count}, Percentage - {percentage:.2f}%")

print("\nTop words in the entire dataset with frequency percentage:")
for word, count, percentage in top_all_words:
    print(f"{word}: Count - {count}, Percentage - {percentage:.2f}%")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.



Top words indicating Positive sentiment with frequency percentage:
game: Count - 10805, Percentage - 5.36%
play: Count - 2902, Percentage - 1.44%
get: Count - 2153, Percentage - 1.07%
like: Count - 1984, Percentage - 0.98%
time: Count - 1507, Percentage - 0.75%
good: Count - 1433, Percentage - 0.71%
fun: Count - 1342, Percentage - 0.67%
dont: Count - 1165, Percentage - 0.58%
even: Count - 1103, Percentage - 0.55%
one: Count - 1088, Percentage - 0.54%
make: Count - 1072, Percentage - 0.53%
realli: Count - 1014, Percentage - 0.50%
still: Count - 955, Percentage - 0.47%
would: Count - 901, Percentage - 0.45%
great: Count - 867, Percentage - 0.43%
hour: Count - 848, Percentage - 0.42%
much: Count - 825, Percentage - 0.41%
buy: Count - 783, Percentage - 0.39%
go: Count - 773, Percentage - 0.38%
want: Count - 760, Percentage - 0.38%

Top words indicating Negative sentiment with frequency percentage:
game: Count - 9327, Percentage - 5.32%
play: Count - 2342, Percentage - 1.34%
like: Count - 

# Models

## MLPClassifier

In [9]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming X_train and y_train are training data and labels,
# and X_test and y_test are testing data and labels.

# Initialize the TfidfVectorizer with n-gram range
tfidf_vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))

# Fit and transform the training data and transform the testing data
x_train_tfidf = tfidf_vectorizer.fit_transform(x_train)
x_test_tfidf = tfidf_vectorizer.transform(x_test)

# Initialize the MLPClassifier
mlp_model = MLPClassifier(random_state=42)

# Train the model with the training data
mlp_model.fit(x_train_tfidf, y_train)

# Predict the sentiments for the test data
y_pred = mlp_model.predict(x_test_tfidf)

# Calculate the accuracy of the predictions and print the classification report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(report)

Accuracy: 0.5223621363438993
              precision    recall  f1-score   support

           0       0.47      0.46      0.46      1048
           1       0.56      0.58      0.57      1255

    accuracy                           0.52      2303
   macro avg       0.52      0.52      0.52      2303
weighted avg       0.52      0.52      0.52      2303



In [10]:
# Preprocess the new sentence using the defined preprocess_text function
text = """
The game sucks
"""
new_sentence_to_predict = preprocess_text(text)

# Transform the preprocessed sentence to TF-IDF vector using the fitted tfidf_vectorizer
new_sentence_tfidf = tfidf_vectorizer.transform([new_sentence_to_predict])

# Predict the sentiment using the trained MLP model
new_sentence_pred = mlp_model.predict(new_sentence_tfidf)

# Output the prediction (1 for positive, 0 for negative)
predicted_sentiment = "Positive" if new_sentence_pred[0] == 1 else "Negative"
predicted_sentiment

'Negative'

In [14]:
text2 = """
The game is great
"""
new_sentence_to_predict2 = preprocess_text(text2)

# Transform the preprocessed sentence to TF-IDF vector using the fitted tfidf_vectorizer
new_sentence_tfidf2 = tfidf_vectorizer.transform([new_sentence_to_predict2])

# Predict the sentiment using the trained MLP model
new_sentence_pred2 = mlp_model.predict(new_sentence_tfidf2)

# Output the prediction (1 for positive, 0 for negative)
predicted_sentiment2 = "Positive" if new_sentence_pred2[0] == 1 else "Negative"
predicted_sentiment2

'Positive'

## CNN

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def train_cnn_model():
  # Parameters
  vocab_size = 10000  # Adjust as necessary
  embedding_dim = 64  # Adjust as necessary
  max_length = 200  # Should be close to the actual length of the preprocessed text
  filter_sizes = 128  # The number of output filters in the convolution
  kernel_size = 5  # The length of the convolution window

  trunc_type='post'
  padding_type='post'
  oov_tok = "<OOV>"

  # Tokenize and pad sequences
  tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
  tokenizer.fit_on_texts(x_train)

  word_index = tokenizer.word_index
  train_sequences = tokenizer.texts_to_sequences(x_train)
  train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

  validation_sequences = tokenizer.texts_to_sequences(x_test)
  validation_padded = pad_sequences(validation_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)


  # Define the CNN model
  cnn_model = Sequential([
      Embedding(vocab_size, embedding_dim, input_length=max_length),
      Conv1D(filter_sizes, kernel_size, activation='relu'),
      GlobalMaxPooling1D(),
      Dense(10, activation='relu'),
      Dropout(0.2),
      Dense(1, activation='sigmoid')  # Use 'softmax' for multi-class classification
  ])

  # Compile the model
  cnn_model.compile(loss='binary_crossentropy',  # Or 'categorical_crossentropy' for a multi-class problem
                optimizer='adam',
                metrics=['accuracy'])

  # Summary of the model
  cnn_model.summary()

  # Train the model
  num_epochs = 10
  history = cnn_model.fit(train_padded, y_train,
                      epochs=num_epochs,
                      validation_data=(validation_padded, y_test))

  return cnn_model

cnn_model = train_cnn_model()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 64)           640000    
                                                                 
 conv1d (Conv1D)             (None, 196, 128)          41088     
                                                                 
 global_max_pooling1d (Glob  (None, 128)               0         
 alMaxPooling1D)                                                 
                                                                 
 dense (Dense)               (None, 10)                1290      
                                                                 
 dropout (Dropout)           (None, 10)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                        

## LSTM

In [12]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def train_lstm_model():
  # Parameters
  vocab_size = 10000  # This should be adjusted to the size of your vocabulary
  embedding_dim = 64  # This can be tuned
  max_length = 200  # This should be set to the length that covers most of the data or based on exploratory analysis
  trunc_type='post'
  padding_type='post'
  oov_tok = "<OOV>"

  # Tokenize and pad sequences
  tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
  tokenizer.fit_on_texts(x_train)

  word_index = tokenizer.word_index
  train_sequences = tokenizer.texts_to_sequences(x_train)
  train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

  validation_sequences = tokenizer.texts_to_sequences(x_test)
  validation_padded = pad_sequences(validation_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

  # Define the LSTM model
  model = Sequential([
      Embedding(vocab_size, embedding_dim, input_length=max_length),
      LSTM(64, return_sequences=True),
      Dropout(0.2),
      LSTM(32),
      Dense(24, activation='relu'),
      Dropout(0.2),
      Dense(1, activation='sigmoid')  # Use 'softmax' for multiclass classification
  ])

  # Compile the model
  model.compile(loss='binary_crossentropy',  # Use 'categorical_crossentropy' for multiclass classification
                optimizer='adam',
                metrics=['accuracy'])

  # Summary of the model
  model.summary()

  # Train the model
  num_epochs = 10
  history = model.fit(train_padded, y_train,
                      epochs=num_epochs,
                      validation_data=(validation_padded, y_test))

  return model

lstm_model = train_lstm_model()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 200, 64)           640000    
                                                                 
 lstm (LSTM)                 (None, 200, 64)           33024     
                                                                 
 dropout_1 (Dropout)         (None, 200, 64)           0         
                                                                 
 lstm_1 (LSTM)               (None, 32)                12416     
                                                                 
 dense_2 (Dense)             (None, 24)                792       
                                                                 
 dropout_2 (Dropout)         (None, 24)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                

KeyboardInterrupt: ignored

## Logistic Regression, SVM,  Naive Bayes Classifier, LinearSVC, RandomForestClassifier, XGBClassifier

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

def train_and_evaluate_classifiers(x_train, y_train, x_test, y_test):
    classifiers = {
        'SVM': SVC(kernel='linear'),
        'LogisticRegression': LogisticRegression(),
        'NaiveBayes': MultinomialNB(),
        'LinearSVC': LinearSVC(),
        'RandomForest': RandomForestClassifier(),
        'XGBoost': XGBClassifier(),
        'GradientBoosting': GradientBoostingClassifier(),
        'AdaBoost': AdaBoostClassifier(),
        'DecisionTree': DecisionTreeClassifier(),
        'KNN': KNeighborsClassifier()
    }

    trained_classifiers = {}

    for clf_name, clf in classifiers.items():
        text_clf = Pipeline([
            ('tfidf', TfidfVectorizer()),
            ('clf', clf)
        ])
        text_clf.fit(x_train, y_train)
        trained_classifiers[clf_name] = text_clf

        predicted = text_clf.predict(x_test)

        print(f"Classifier: {clf_name}")
        print(classification_report(y_test, predicted))
        print(f"Accuracy: {accuracy_score(y_test, predicted)}")
        print("-----------------------------")

    return trained_classifiers

# Usage example:

# Assuming x_train, y_train, x_test, y_test are defined
sklearn_trained_models = train_and_evaluate_classifiers(x_train, y_train, x_test, y_test)

In [None]:
# Use a trained model for prediction
trained_models.update(sklearn_trained_models)
trained_models['MLP'] = mlp_model
trained_models['CNN'] = cnn_model
trained_models['LSTM'] = lstm_model

MAX_LENGTH = 200  # Should be close to the actual length of the preprocessed text
TRUNC_TYPE='post'
PADDING_TYPE='post'

new_text = "The game sucks"
preprocessed_text = preprocess_text(new_text)  # preprocess_text is your custom preprocessing function

# Choose a model for prediction (e.g., SVM as an example)
tempting_models = ['SVM', 'LogisticRegression', 'NaiveBayes', 'LinearSVC', 'RandomForest', 'XGBoost',
                   'GradientBoosting', 'AdaBoost', 'DecisionTree', 'KNN', 'MLP']

sentiment_labels = []

for model in tempting_models:

  if model == 'MLP':
    # Transform the preprocessed sentence to TF-IDF vector using the fitted tfidf_vectorizer
    new_sentence_tfidf = tfidf_vectorizer.transform([preprocessed_text])

    # Predict the sentiment using the trained MLP model
    predicted_sentiment = mlp_model.predict(new_sentence_tfidf)

  # elif model == 'CNN':
  #   # Convert the text to a sequence
  #   input_sequence = tokenizer.texts_to_sequences([preprocessed_text])

  #   # Pad the sequence
  #   input_padded = pad_sequences(input_sequence, maxlen=MAX_LENGTH, padding=TRUNC_TYPE, truncating=PADDING_TYPE)

  #   # Predict the sentiment
  #   prediction = model.predict(input_padded)

  #   # Interpret the output
  #   # Since it's a binary classification, we can use 0.5 as a threshold to interpret the sentiment
  #   sentiment_labels = "positive" if prediction[0][0] > 0.5 else "negative"

  else:

    selected_model = trained_models[model]
    predicted_sentiment = selected_model.predict([preprocessed_text])

    sentiment_label = 'positive' if predicted_sentiment[0] == 1 else 'negative'


  print(f"The predicted sentiment for '{new_text}' using {model} is '{sentiment_label}'.")

In [None]:
def contains_no_letters(text):
   # Check if the text contains no letters
   return not any(char.isalpha() for char in text)
   # Remove rows with no letters in 'review'
   df = df[~df['review'].apply(contains_no_letters)]

Model for Votes Up

In [16]:
import numpy as np

np.count_nonzero(steam_filtered['votes_up'])
np.sum(steam_filtered['votes_up'] > 0)

# Divide votes_up into two cases (1 for >=5 and 0 for <5)
steam_filtered['useful'] = (steam_data['votes_up'] >= 5).astype(int)

# Display the distribution of the sentiments and the first few rows of the new dataframe
sentiment_distribution = steam_filtered['useful'].value_counts()
steam_filtered[['review', 'votes_up', 'useful']].head(), sentiment_distribution

(                                              review  votes_up  useful
 0                                    good gamei like         0       0
 1  definit game play one best game excit detail c...         0       0
 2                             onlin jank shit aoe yo         0       0
 3  game immers allow mani simul game also allow p...         0       0
 4  everyon post negat hate game yet still alpha b...         1       0,
 0    10636
 1      878
 Name: useful, dtype: int64)

In [17]:
from sklearn.model_selection import train_test_split

# Split the data into training and test sets
x_useful_train, x_useful_test, y_useful_train, y_useful_test = train_test_split(
    steam_filtered['review'],
    steam_filtered['useful'],
    test_size=0.2,
    random_state=42
)

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming X_train and y_train are training data and labels,
# and X_test and y_test are testing data and labels.

# Initialize the TfidfVectorizer with n-gram range
tfidf_vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))

# Fit and transform the training data and transform the testing data
x_useful_train_tfidf = tfidf_vectorizer.fit_transform(x_useful_train)
x_useful_test_tfidf = tfidf_vectorizer.transform(x_useful_test)

# Initialize the MLPClassifier
mlp_useful_model = MLPClassifier(random_state=42)

# Train the model with the training data
mlp_useful_model.fit(x_useful_train_tfidf, y_useful_train)

# Predict the sentiments for the test data
y_useful_pred = mlp_useful_model.predict(x_useful_test_tfidf)

# Calculate the accuracy of the predictions and print the classification report
useful_accuracy = accuracy_score(y_useful_test, y_useful_pred)
useful_report = classification_report(y_useful_test, y_useful_pred)

print(f"Accuracy: {useful_accuracy}")
print(useful_report)



In [None]:
# Preprocess the new sentence using the defined preprocess_text function
useful_text = """
The game sucks
"""
new_sentence_to_predict_useful = preprocess_text(useful_text)

# Transform the preprocessed sentence to TF-IDF vector using the fitted tfidf_vectorizer
new_useful_sentence_tfidf = tfidf_vectorizer.transform([new_sentence_to_predict_useful])

# Predict the sentiment using the trained MLP model
new_useful_sentence_pred = mlp_useful_model.predict(new_useful_sentence_tfidf)

# Output the prediction (1 for positive, 0 for negative)
predicted_useful_sentiment = "Positive" if new_useful_sentence_pred[0] == 1 else "Negative"
predicted_useful_sentiment

Model for Votes Funny

In [None]:
import numpy as np

np.count_nonzero(steam_filtered['votes_funny'])
np.sum(steam_filtered['votes_funny'] > 0)

# Divide votes_up into two cases (1 for >=5 and 0 for <5)
steam_filtered['isFunny'] = (steam_data['votes_funny'] >= 5).astype(int)

# Display the distribution of the sentiments and the first few rows of the new dataframe
sentiment_distribution = steam_filtered['isFunny'].value_counts()
steam_filtered[['review', 'votes_funny', 'isFunny']].head(), sentiment_distribution

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into training and test sets
x_funny_train, x_funny_test, y_funny_train, y_funny_test = train_test_split(
    steam_filtered['review'],
    steam_filtered['isFunny'],
    test_size=0.2,
    random_state=42
)

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming X_train and y_train are training data and labels,
# and X_test and y_test are testing data and labels.

# Initialize the TfidfVectorizer with n-gram range
tfidf_vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))

# Fit and transform the training data and transform the testing data
x_funny_train_tfidf = tfidf_vectorizer.fit_transform(x_funny_train)
x_funny_test_tfidf = tfidf_vectorizer.transform(x_funny_test)

# Initialize the MLPClassifier
mlp_funny_model = MLPClassifier(random_state=42)

# Train the model with the training data
mlp_funny_model.fit(x_funny_train_tfidf, y_funny_train)

# Predict the sentiments for the test data
y_funny_pred = mlp_funny_model.predict(x_funny_test_tfidf)

# Calculate the accuracy of the predictions and print the classification report
funny_accuracy = accuracy_score(y_funny_test, y_funny_pred)
funny_report = classification_report(y_funny_test, y_funny_pred)

print(f"Accuracy: {funny_accuracy}")
print(funny_report)



In [None]:
# Preprocess the new sentence using the defined preprocess_text function
funny_text = """
The game sucks
"""
new_sentence_to_predict_funny = preprocess_text(funny_text)

# Transform the preprocessed sentence to TF-IDF vector using the fitted tfidf_vectorizer
new_funny_sentence_tfidf = tfidf_vectorizer.transform([new_sentence_to_predict_funny])

# Predict the sentiment using the trained MLP model
new_funny_sentence_pred = mlp_funny_model.predict(new_funny_sentence_tfidf)

# Output the prediction (1 for positive, 0 for negative)
predicted_funny_sentiment = "Positive" if new_funny_sentence_pred[0] == 1 else "Negative"
predicted_funny_sentiment