In [None]:
import re
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
# Fetch the NLTK stopword corpus that stopwords.words('english') reads in later cells.
nltk.download('stopwords')


In [None]:
# Load the CSV into `data`; later cells read its `text` and `sentiment` columns.
data = pd.read_csv('apple-twitter-sentiment-texts.csv')
data.head()

In [None]:
# Translate the numeric sentiment codes into readable labels in one vectorized
# pass: -1 -> "negative", 0 -> "neutral", anything else -> "positive".
# Assigning the whole column avoids per-row chained writes of the form
# data["sentiment"][i] = ..., which raise SettingWithCopyWarning and are
# silently dropped when pandas copy-on-write is enabled.
# The label strings map to themselves so that re-running this cell is a no-op
# instead of collapsing every row to "positive".
sentiment_names = {
    -1: "negative",
    0: "neutral",
    "negative": "negative",
    "neutral": "neutral",
}
# Values missing from the mapping come back as NaN and become "positive".
data["sentiment"] = data["sentiment"].map(sentiment_names).fillna("positive")

data.head()

In [None]:
# Count how many rows carry each sentiment label.
data["sentiment"].value_counts()

In [None]:
# Bar chart of the number of rows per sentiment label.
# The column is passed by keyword because seaborn >= 0.12 only accepts `data`
# positionally; a bare Series is no longer interpreted as the x variable.
sns.countplot(x="sentiment", data=data)
# Sentiment categories run along the x-axis and bar heights are counts,
# so the axis labels must name them in that order.
plt.xlabel("Sentiment")
plt.ylabel("Count")
# Caption placed in figure coordinates, just below the plot area.
plt.text(0.5, -0.1, 'Figure 1: Sentiment Distribution', size=12, ha='center', transform=plt.gcf().transFigure)
plt.show()


In [None]:
# Concatenate every row of the text column into one string; the word cloud cell consumes `text`.
text = " ".join(data["text"])
# Report whitespace-separated words: len(text) alone would count characters, not words.
print("There are {} words in the combination of all review.".format(len(text.split())))

In [None]:
# Build the word cloud from the concatenated text on a white background.
wordcloud = WordCloud(background_color="white").generate(text)

# Draw it on a 12x12 figure with the axes hidden and a caption underneath.
fig = plt.figure(figsize=(12, 12))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.text(0.5, -0.05, 'Figure 2: Word Cloud', size=12, ha='center', transform=fig.transFigure)
plt.show()


In [None]:
# Show the text of the second row as loaded (the cleanup cell runs further down).
data.iloc[1].text

In [None]:
# Show the text of the first row as loaded (the cleanup cell runs further down).
data.iloc[0].text

In [None]:
def remove_chars():
    """Strip URLs, hashtags, cashtags, @mentions and punctuation from ``data["text"]``.

    Operates on the notebook-global ``data`` frame: the cleaned strings replace
    the ``text`` column. Takes no arguments and returns nothing.
    """
    # Raw strings so `\(`, `\)` and `\$` reach the regex engine verbatim instead
    # of relying on Python passing unknown string escapes through.
    pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'  # URLs
        r'|#[a-zA-Z]+'   # hashtags
        r'|\$[a-zA-Z]+'  # cashtags; an unescaped `$` is the end-of-string anchor and never matches here
        r'|@[a-zA-Z]+'   # mentions
        # Punctuation. `-` is escaped: an unescaped `%-;` is a character range
        # that also swallows digits, apostrophes, parentheses, `&`, `+` and `/`.
        r'|[,.^_$*%\-;鶯!?:]'
    )
    # Assign the whole column at once; per-row chained indexing
    # (data["text"][i] = ...) is dropped under pandas copy-on-write.
    data["text"] = data["text"].map(lambda tweet: pattern.sub('', tweet))
# Clean data["text"] (the function works on the global frame), then preview the result.
remove_chars()

data.head()

In [None]:
# Work on a copy so the `data` frame keeps its un-tokenized text.
data_test = data.copy()
stop = stopwords.words('english')
# Lower-case, split on whitespace, then drop every English stopword from each token list.
data_test["text"] = (
    data_test["text"]
    .str.lower()
    .str.split()
    .apply(lambda words: [word for word in words if word not in stop])
)
data_test.head()

In [None]:
# Collapse each row's token list back into a single space-separated string.
# NOTE(review): not idempotent. Running this cell a second time applies ' '.join to the
# already-joined strings, which puts a space between every character.
data_test = data_test.assign(text=data_test.text.map(' '.join))

In [None]:
# Preview data_test after the tokens were joined back into strings.
data_test.head()

In [None]:
max_words = 500  # vocabulary cap handed to the Tokenizer (also reused for TF-IDF later)
max_len = 20     # target length of every encoded sequence


def tokenize_pad_sequences(text):
    """
    Encode texts as fixed-length integer sequences.

    A fresh Tokenizer capped at ``max_words`` is fitted on ``text``; the texts
    are then converted to lists of integer ids and padded at the end so that
    every sequence has length ``max_len``.

    Returns a ``(padded_sequences, fitted_tokenizer)`` pair.
    """
    fitted = Tokenizer(num_words=max_words, lower=True, split=' ')
    fitted.fit_on_texts(text)
    padded = pad_sequences(
        fitted.texts_to_sequences(text),
        padding='post',
        maxlen=max_len,
    )
    return padded, fitted

# Show the first row before and after encoding; `X` and `tokenizer` are reused by later cells.
print('Before Tokenization & Padding \n', data_test['text'][0])
X, tokenizer = tokenize_pad_sequences(data_test['text'])
print('After Tokenization & Padding \n', X[0])

In [None]:
# One-hot encode the sentiment labels: one indicator column per class.
y = pd.get_dummies(data_test['sentiment'])
# Keep 10% of the rows aside for testing; the fixed seed makes the split repeatable.
train_data, test_data, train_label, test_label = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.10,
)

print(f"Train Set -> {train_data.shape} {train_label.shape}")
print(f"Test Set -> {test_data.shape} {test_label.shape}")

In [None]:
# Size the embedding table from the vocabulary, not from the number of texts:
# `tokenizer.document_count` is how many texts the tokenizer was fitted on.
# The tokenizer only emits ids below `num_words` (id 0 is the padding value),
# and never ids above len(word_index), so the table needs
# min(num_words, len(word_index) + 1) rows.
vocab_size = len(tokenizer.word_index) + 1
if tokenizer.num_words is not None:
    vocab_size = min(vocab_size, tokenizer.num_words)

# Embedding -> LSTM -> softmax over the three sentiment classes.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 16),
    tf.keras.layers.LSTM(16, activation="relu"),
    tf.keras.layers.Dense(3, activation="softmax")
])

model.summary()

In [None]:
# Categorical cross-entropy matches the one-hot labels; accuracy is logged under the key "acc".
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["acc"],
)
# Train for 10 epochs; the held-out test split doubles as the validation set.
history = model.fit(
    train_data,
    train_label,
    validation_data=(test_data, test_label),
    epochs=10,
)

In [None]:
def plot_graphs(history, string, num):
    """Plot a training metric and its validation counterpart against epochs.

    history: object whose ``history`` dict holds the per-epoch values.
    string:  metric key; the validation series is read from ``'val_' + string``.
    num:     figure number used in the caption drawn below the plot.
    """
    val_key = 'val_' + string
    for series_name in (string, val_key):
        plt.plot(history.history[series_name])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, val_key])
    caption = f"Figure {num}: {string}"
    plt.text(0.5, -0.1, caption, size=12, ha='center', transform=plt.gcf().transFigure)
    plt.show()

# Training vs. validation curves for the "acc" and "loss" series recorded in `history`.
plot_graphs(history, "acc", 3)
plot_graphs(history, "loss", 4)


In [None]:
# Random forest fitted directly on the padded token-id sequences and the one-hot label
# frame produced by the train/test split of X and y.
rf = RandomForestClassifier(n_estimators=100)
rf.fit(train_data, train_label)
# Score on the held-out split; shown as the cell output.
rf.score(test_data, test_label)

In [None]:
import joblib

# TF-IDF + random forest trained on the cleaned text column and the string sentiment labels.
# Hold out 20% of the rows so the saved model can be checked on unseen texts.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data['text'], data['sentiment'], test_size=0.2, random_state=42
)

# Fit the vectorizer on the training texts only; held-out texts are transformed, never fitted on.
tfidf_vectorizer = TfidfVectorizer(max_features=max_words)
tfidf_features_train = tfidf_vectorizer.fit_transform(train_texts)

# Seeded so that re-running the notebook rebuilds the same forest.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(tfidf_features_train, train_labels)

# Evaluate on the held-out split before persisting anything.
tfidf_features_test = tfidf_vectorizer.transform(test_texts)
print('TF-IDF RandomForest test accuracy:', rf.score(tfidf_features_test, test_labels))

# Persist the fitted vectorizer together with the model: prediction must reuse
# exactly the vocabulary the forest was trained on.
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(rf, 'random_forest_model.pkl')


In [None]:
def predict_sentiment_rf(text):
    """Predict the sentiment label of one raw text with the TF-IDF random forest.

    Uses the notebook-global ``tfidf_vectorizer`` and ``rf`` objects.

    Args:
        text: Raw text; URLs, hashtags, cashtags, @mentions, punctuation and
            English stopwords are removed before vectorizing.

    Returns:
        str: the entry of ``rf.classes_`` with the highest predicted probability.
    """
    # Raw strings so `\(`, `\)` and `\$` reach the regex engine verbatim.
    noise = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'  # URLs
        r'|#[a-zA-Z]+'   # hashtags
        r'|\$[a-zA-Z]+'  # cashtags; an unescaped `$` is the end-of-string anchor and never matches here
        r'|@[a-zA-Z]+'   # mentions
        # Punctuation. `-` is escaped: an unescaped `%-;` is a character range
        # that also swallows digits, apostrophes, parentheses, `&`, `+` and `/`.
        r'|[,.^_$*%\-;鶯!?:]'
    )
    # Read the stopword list once per call (not once per token) and use a set
    # for constant-time membership tests.
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in noise.sub('', text).lower().split() if word not in stop_words]
    cleaned = ' '.join(tokens)

    # Vectorize with the already-fitted TF-IDF vocabulary.
    tfidf_features = tfidf_vectorizer.transform([cleaned])

    # predict_proba columns follow rf.classes_, so take the label from there
    # rather than from a hard-coded list that could be ordered differently.
    pred_prob = rf.predict_proba(tfidf_features)
    best_index = pred_prob.argmax(axis=1)[0]
    return str(rf.classes_[best_index])

# Run a few hand-written example texts through the TF-IDF + random forest predictor
# and print the predicted label for each, in order.
for new_text in (
    "Thank u !! @apple for the best product!",
    "well your product doesn't work @apple",
    "@apple work on your products",
):
    print(predict_sentiment_rf(new_text))
