In [None]:
import pandas as pd


sentiment140_df = pd.read_csv('sentiment140.csv', encoding='latin-1', names=['target', 'ids', 'date', 'flag', 'user', 'text'])


sentiment140_df = sentiment140_df[['target', 'text']]


positive_tweets = sentiment140_df[sentiment140_df['target'] == 4].sample(n=15000, random_state=42)
negative_tweets = sentiment140_df[sentiment140_df['target'] == 0].sample(n=15000, random_state=42)


balanced_sentiment140_df = pd.concat([positive_tweets, negative_tweets])

balanced_sentiment140_df = balanced_sentiment140_df.sample(frac=1, random_state=42)


balanced_sentiment140_df.to_csv('sentiment45k.csv', index=False)

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report


sentiment140_df = pd.read_csv('sentiment45k.csv', encoding='latin-1', names=['target', 'text'])

sentiment140_df = sentiment140_df[['target', 'text']]


cleaned_tweets_df = pd.read_excel('cleaned_tweets.xlsx')


X_train, X_test, y_train, y_test = train_test_split(sentiment140_df['text'], sentiment140_df['target'], test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

model = SVC(kernel='linear')
model.fit(X_train_vectorized, y_train)

y_pred = model.predict(X_test_vectorized)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))

cleaned_tweets_vectorized = vectorizer.transform(cleaned_tweets_df['Tweet'])
predicted_sentiment = model.predict(cleaned_tweets_vectorized)
cleaned_tweets_df['predicted_sentiment'] = predicted_sentiment


cleaned_tweets_df.to_excel('cleaned_tweets_with_sentiment_predictions.xlsx', index=False)


Accuracy: 0.7617063822696217
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.77      0.76      2989
           4       0.77      0.75      0.76      3012

    accuracy                           0.76      6001
   macro avg       0.76      0.76      0.76      6001
weighted avg       0.76      0.76      0.76      6001



In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier

from sklearn.feature_extraction.text import TfidfVectorizer

sentiment140_df = pd.read_csv('sentiment45k.csv', encoding='latin-1', names=['target', 'text'])
sentiment140_df = sentiment140_df[['target', 'text']]
tweets_df = pd.read_excel('sorted_tweets.xlsx')
X_train, X_test, y_train, y_test = train_test_split(sentiment140_df['text'], sentiment140_df['target'], test_size=0.2, random_state=42)
# Vectorize 
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train_vectorized, y_train)
nb_y_pred = nb_model.predict(X_test_vectorized)
nb_accuracy = accuracy_score(y_test, nb_y_pred)
print("Naive Bayes Accuracy:", nb_accuracy)
print("Naive Bayes Classification Report:")
print(classification_report(y_test, nb_y_pred))

#SVM
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_vectorized, y_train)
y_pred = svm_model.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)
print("SVM Accuracy:", accuracy)
print("SVM Classification Report:")
print(classification_report(y_test, y_pred))

#  Logistic Regression model
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_vectorized, y_train)
lr_y_pred = lr_model.predict(X_test_vectorized)
lr_accuracy = accuracy_score(y_test, lr_y_pred)
print("Logistic Regression Accuracy:", lr_accuracy)
print("Logistic Regression Classification Report:")
print(classification_report(y_test, lr_y_pred))

#  Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_vectorized, y_train)
rf_y_pred = rf_model.predict(X_test_vectorized)
rf_accuracy = accuracy_score(y_test, rf_y_pred)
print("Random Forest Accuracy:", rf_accuracy)
print("Random Forest Classification Report:")
print(classification_report(y_test, rf_y_pred))

#  Gradient Boosting Machine
gbm_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gbm_model.fit(X_train_vectorized, y_train)
gbm_y_pred = gbm_model.predict(X_test_vectorized)
gbm_accuracy = accuracy_score(y_test, gbm_y_pred)
print("Gradient Boosting Machine Accuracy:", gbm_accuracy)
print("Gradient Boosting Machine Classification Report:")
print(classification_report(y_test, gbm_y_pred))

# Decision Tree
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_vectorized, y_train)
dt_y_pred = dt_model.predict(X_test_vectorized)
dt_accuracy = accuracy_score(y_test, dt_y_pred)
print("Decision Tree Accuracy:", dt_accuracy)
print("Decision Tree Classification Report:")
print(classification_report(y_test, dt_y_pred))
# Predict sentiment for cleaned tweets
cleaned_tweets_vectorized = vectorizer.transform(cleaned_tweets_df['Tweet'])

# Naive Bayes
nb_predicted_sentiment = nb_model.predict(cleaned_tweets_vectorized)
cleaned_tweets_df['nb_predicted_sentiment'] = nb_predicted_sentiment

#SVM
svm_predicted_sentiment = svm_model.predict(cleaned_tweets_vectorized)
cleaned_tweets_df['svm_predicted_sentiment'] = svm_predicted_sentiment

# Logistic Regression
lr_predicted_sentiment = lr_model.predict(cleaned_tweets_vectorized)
cleaned_tweets_df['lr_predicted_sentiment'] = lr_predicted_sentiment

# Random Forest
rf_predicted_sentiment = rf_model.predict(cleaned_tweets_vectorized)
cleaned_tweets_df['rf_predicted_sentiment'] = rf_predicted_sentiment

# GBM
gbm_predicted_sentiment = gbm_model.predict(cleaned_tweets_vectorized)
cleaned_tweets_df['gbm_predicted_sentiment'] = gbm_predicted_sentiment

#DT
dt_predicted_sentiment = dt_model.predict(cleaned_tweets_vectorized)
cleaned_tweets_df['dt_predicted_sentiment'] = dt_predicted_sentiment

cleaned_tweets_df.to_excel('tweets_with_predictions.xlsx', index=False)


Naive Bayes Accuracy: 0.7430428261956341
Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.83      0.76      2989
           4       0.79      0.66      0.72      3012

    accuracy                           0.74      6001
   macro avg       0.75      0.74      0.74      6001
weighted avg       0.75      0.74      0.74      6001

SVM Accuracy: 0.7617063822696217
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.77      0.76      2989
           4       0.77      0.75      0.76      3012

    accuracy                           0.76      6001
   macro avg       0.76      0.76      0.76      6001
weighted avg       0.76      0.76      0.76      6001

Logistic Regression Accuracy: 0.7632061323112814
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.77      0.76      2989
         

In [10]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

sentiment_df = pd.read_csv('sentiment45k.csv', encoding='latin-1', names=['target', 'text'])
sentiment_df['target'] = sentiment_df['target']
sentiment_df.dropna(subset=['text'], inplace=True)
X_train, X_test, y_train, y_test = train_test_split(sentiment_df['text'], sentiment_df['target'], test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

sentiment_df.dropna(subset=['text'], inplace=True)

lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_vectorized, y_train)
lr_y_pred = lr_model.predict(X_test_vectorized)
lr_accuracy = accuracy_score(y_test, lr_y_pred)
print("Logistic Regression Accuracy:", lr_accuracy)
print("Logistic Regression Classification Report:")
print(classification_report(y_test, lr_y_pred))

tweets_df = pd.read_excel('sentiment_data_with_regions.xlsx')

tweets_df.dropna(subset=['Tweet'], inplace=True)

cleaned_tweets_vectorized = vectorizer.transform(tweets_df['Tweet'])

predicted_sentiment = lr_model.predict(cleaned_tweets_vectorized)
tweets_df['predicted_sentiment'] = predicted_sentiment

tweets_df.to_excel('sentiment_data_with_predicted_sentiment.xlsx', index=False)


Logistic Regression Accuracy: 0.7617063822696217
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.77      0.76      2989
           4       0.77      0.76      0.76      3012

    accuracy                           0.76      6001
   macro avg       0.76      0.76      0.76      6001
weighted avg       0.76      0.76      0.76      6001



In [11]:
import pandas as pd


tweets_df = pd.read_excel('sentiment_data_with_predicted_sentiment.xlsx')

tweets_df['predicted_sentiment'] = tweets_df['predicted_sentiment'].map({0: 'negative', 4: 'positive'})

tweets_df.to_excel('sentiment_data_with_predicted_sentiment.xlsx', index=False)


In [4]:
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.metrics import accuracy_score, classification_report


nltk.download('vader_lexicon')


df = pd.read_csv('sentiment45k.csv')

sia = SentimentIntensityAnalyzer()


df['vader_score'] = df['text'].apply(lambda x: sia.polarity_scores(x)['compound'])


df['vader_sentiment'] = df['vader_score'].apply(lambda x: 4 if x >= 0 else 0)


accuracy = accuracy_score(df['target'], df['vader_sentiment'])
print(f"Accuracy of VADER: {accuracy}")


report = classification_report(df['target'], df['vader_sentiment'], target_names=['Negative', 'Positive'])
print("Classification Report for VADER Sentiment Analysis:\n", report)


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\sweth\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Accuracy of VADER: 0.6586
Classification Report for VADER Sentiment Analysis:
               precision    recall  f1-score   support

    Negative       0.80      0.42      0.55     15000
    Positive       0.61      0.90      0.72     15000

    accuracy                           0.66     30000
   macro avg       0.70      0.66      0.64     30000
weighted avg       0.70      0.66      0.64     30000

