In [None]:
# Import libraries
import pickle
import re
import string

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import pyforest
import seaborn as sns
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import train_test_split

In [None]:
# Load the tweets from CSV. Column names are passed explicitly via `names`,
# so pandas treats the first row of the file as data rather than as a header.
columns = ["sentiment", "ID", "datetime", "query", "username", "text"]
df = pd.read_csv(
    'sentiment_dataset.csv',
    sep=',',
    encoding="ISO-8859-1",
    names=columns,
)
df.head()

In [None]:
# Keep only the sentiment and text columns; they are the ones relevant to this analysis.
# .copy() makes the selection an independent frame, so the later
# `df['text'] = ...` assignments do not trigger SettingWithCopyWarning
# on pandas versions without copy-on-write.
df = df[['sentiment', 'text']].copy()
df.head()

In [None]:
# Function that performs cleaning using the re library.
# It lowercases the text and removes bracketed text, links, HTML tags,
# punctuation and words containing digits.
def cleaning(a):
    """Return a lowercased, de-noised version of ``a``.

    Parameters
    ----------
    a : object
        Raw text. Non-string values are converted with ``str()`` first.

    Returns
    -------
    str
        The text after, in order: lowercasing, removing ``[...]`` spans,
        removing URLs, removing ``<...>`` tags, removing ASCII punctuation,
        replacing newlines with a space, and removing any word that
        contains a digit. Surrounding whitespace is not trimmed.
    """
    a = str(a).lower()
    a = re.sub(r'\[.*?\]', '', a)
    # URLs and tags must be removed while ':', '/', '.', '<' and '>' are still
    # present; once punctuation is stripped these patterns can no longer match.
    a = re.sub(r'https?://\S+|www\.\S+', '', a)
    a = re.sub(r'<.*?>+', '', a)
    a = re.sub('[%s]' % re.escape(string.punctuation), '', a)
    # Replace with a space (not '') so the words on either side stay separate.
    a = re.sub(r'\n', ' ', a)
    a = re.sub(r'\w*\d\w*', '', a)
    return a

# Run every tweet through cleaning() and write the result back to the text column
df['text'] = df['text'].map(cleaning)
df.head()

In [None]:
# Class balance check: number of rows per sentiment label
sentiment_counts = df['sentiment'].value_counts()
sentiment_counts

In [None]:
# Removing stop words i.e., the, is, and, or, in, this etc
s_words = stopwords.words('english')
# Frozen set copy of the list: membership tests are hashed lookups instead of
# a scan of the whole list for every token of every tweet.
_s_words_lookup = frozenset(s_words)
# NOTE(review): cleaning() strips apostrophes before this step, so contractions
# presumably arrive as e.g. "dont" and would not match an entry spelled "don't"
# - confirm whether that is intended.


def removing(text):
    """Return ``text`` with English stop words dropped.

    Parameters
    ----------
    text : str
        Text whose tokens are separated by single spaces.

    Returns
    -------
    str
        The tokens that are not stop words, joined with ``' '``. The text is
        split on ``' '`` exactly, so runs of spaces yield empty tokens that
        are kept as-is.
    """
    return ' '.join(i for i in text.split(' ') if i not in _s_words_lookup)
    
# Drop stop words from every tweet and write the result back to the text column
df['text'] = df['text'].map(removing)
df.head()

In [None]:
# Replace the numeric labels with readable names: 0 -> negative, 4 -> positive.
# Indexing the dict directly means any other label value raises KeyError.
class_dict = {
    0: 'negative',
    4: 'positive',
}
df['sentiment'] = df['sentiment'].map(lambda code: class_dict[code])
df.head()

In [None]:
# Reduce each word to its stem with NLTK's English Snowball stemmer
stemmer = nltk.SnowballStemmer("english")


def stemming(text):
    """Stem every token of ``text`` (split on ' ') and join the stems back with ' '."""
    stems = []
    for token in text.split(' '):
        stems.append(stemmer.stem(token))
    return ' '.join(stems)

# Stem every tweet and write the result back to the text column
df['text'] = df['text'].map(stemming)
df.head()

In [None]:
# Trim leading and trailing whitespace from every tweet
stripped_text = df['text'].str.strip()
df['text'] = stripped_text
df.head()

In [None]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the split repeatable
X, y = df['text'], df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Turn the text into TF-IDF features. The vectorizer is fitted on the training
# text only and then applied unchanged to the test text.
# Note: X_train / X_test are rebound from text to feature matrices here.
vectorizer = TfidfVectorizer()
X_train, X_test = vectorizer.fit_transform(X_train), vectorizer.transform(X_test)

In [None]:
# Fit a logistic regression on the training features and report its score on the test set
lr = LogisticRegression().fit(X_train, y_train)
lr.score(X_test, y_test)

In [None]:
# Predict once and reuse the result for every metric below
y_pred = lr.predict(X_test)

print("F1 Score: ", f1_score(y_test, y_pred, pos_label='positive'))
print('Accuracy: ', accuracy_score(y_test, y_pred), "\n")
print(classification_report(y_test, y_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))

In [None]:
# Table of per-term model weights, indexed by vocabulary term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

coefficients = pd.DataFrame(lr.coef_, columns=feature_names).T
coefficients.columns = ['coefficients']
# Absolute value is used to rank terms regardless of the sign of their weight
coefficients['abs'] = coefficients['coefficients'].abs()
coefficients.sort_values(by='abs', ascending=False).head(10)

In [None]:
def predict(x):
    """Classify one raw text and return the model's prediction array (one element).

    The text goes through the same steps as the training data (cleaning,
    stop-word removal, stemming, strip) before vectorizing; otherwise raw
    tokens would not match the vocabulary the vectorizer was fitted on.
    """
    prepared = stemming(removing(cleaning(x))).strip()
    return lr.predict(vectorizer.transform([prepared]))


def sentiment(x):
    """Return 'positive' if the model predicts 'positive' for ``x``, else 'negative'."""
    # predict() returns a one-element array; compare its single label
    return 'positive' if predict(x)[0] == 'positive' else 'negative'


tweet = "I like this movie"
sentiment(tweet)

In [None]:
# Bar chart of the 10 terms with the largest absolute coefficient
top_words = coefficients.sort_values(by='abs', ascending=False).head(10)
plt.figure(figsize=(10, 10))
sns.barplot(x='coefficients', y=top_words.index, data=top_words)
plt.title('Top 10 Positive and Negative Words')
plt.show()

In [None]:
# Bar chart of the 10 terms with the smallest absolute coefficient,
# i.e. the tail of the table sorted by 'abs' in descending order.
bottom_words = coefficients.sort_values(by='abs', ascending=False).tail(10)
plt.figure(figsize=(10, 10))
sns.barplot(x='coefficients', y=bottom_words.index, data=bottom_words)
# The previous title duplicated the top-10 chart's title although this chart shows the opposite end
plt.title('10 Least Influential Words (Smallest Absolute Coefficients)')
plt.show()