In [55]:
# Import libraries
import re
import nltk
import string
import pyforest
import pickle 
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

In [56]:
# Read the CSV file
columns  = ["sentiment", "ID", "datetime", "query", "username", "text"]
df = pd.read_csv('sentiment_dataset.csv', delimiter=',', encoding = "ISO-8859-1", names = columns)
df.head()

<IPython.core.display.Javascript object>

Unnamed: 0,sentiment,ID,datetime,query,username,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [57]:
# Extract sentiment and text column, they will be relevant in this analysis
df = df[['sentiment','text']]
df.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [58]:
# Fuction that performs cleaning using re library. 
# It removes uppercase, brackets, links, punctuation etc
def cleaning(a):
    a = str(a).lower()
    a = re.sub('\[.*?\]', '', a)
    a = re.sub('[%s]' % re.escape(string.punctuation), '', a)
    a = re.sub('\n', '', a)
    a = re.sub('https?://\S+|www\.\S+', '', a)
    a = re.sub('<.*?>+', '', a)
    a = re.sub('\w*\d\w*', '', a)
    return a

df['text'] = df['text'].apply(cleaning)
df.head()

Unnamed: 0,sentiment,text
0,0,switchfoot awww thats a bummer you shoulda ...
1,0,is upset that he cant update his facebook by t...
2,0,kenichan i dived many times for the ball manag...
3,0,my whole body feels itchy and like its on fire
4,0,nationwideclass no its not behaving at all im ...


In [59]:
# Checking for balance
df['sentiment'].value_counts()

0    800000
4    800000
Name: sentiment, dtype: int64

In [60]:
# Removing stop words i.e., the, is, and, or, in, this etc
s_words = stopwords.words('english')

def removing(text):
    a = ' '.join(i for i in text.split(' ') if i not in s_words)
    return a
    
df['text'] = df['text'].apply(removing)
df.head()

Unnamed: 0,sentiment,text
0,0,switchfoot awww thats bummer shoulda got da...
1,0,upset cant update facebook texting might cry r...
2,0,kenichan dived many times ball managed save ...
3,0,whole body feels itchy like fire
4,0,nationwideclass behaving im mad cant see


In [61]:
# Change 0 to negative and 4 to positive
class_dict = {0:'negative', 4:'positive'}
df['sentiment'] = df['sentiment'].apply(lambda x:  class_dict[x])
df.head()

Unnamed: 0,sentiment,text
0,negative,switchfoot awww thats bummer shoulda got da...
1,negative,upset cant update facebook texting might cry r...
2,negative,kenichan dived many times ball managed save ...
3,negative,whole body feels itchy like fire
4,negative,nationwideclass behaving im mad cant see


In [62]:
# Stemming words that have same meaning
stemmer = nltk.SnowballStemmer("english")

def stemming(text):
    text = ' '.join(stemmer.stem(i) for i in text.split(' '))
    return text

df['text'] = df['text'].apply(stemming)
df.head()

Unnamed: 0,sentiment,text
0,negative,switchfoot awww that bummer shoulda got dav...
1,negative,upset cant updat facebook text might cri resul...
2,negative,kenichan dive mani time ball manag save rest...
3,negative,whole bodi feel itchi like fire
4,negative,nationwideclass behav im mad cant see


In [63]:
df['text'] = df['text'].str.strip()
df.head()

Unnamed: 0,sentiment,text
0,negative,switchfoot awww that bummer shoulda got dav...
1,negative,upset cant updat facebook text might cri resul...
2,negative,kenichan dive mani time ball manag save rest...
3,negative,whole bodi feel itchi like fire
4,negative,nationwideclass behav im mad cant see


In [64]:
# Split the data to train data and test data
X = df['text']
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [65]:
# Modeling
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [66]:
# Logistic Regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.78081

In [67]:
print("F1 Score: ", f1_score(y_test, lr.predict(X_test), pos_label='positive'))
print('Accuracy: ', accuracy_score(y_test, lr.predict(X_test)), "\n")
print(classification_report(y_test, lr.predict(X_test)))
print('Confusion Matrix:\n', confusion_matrix(y_test, lr.predict(X_test)))

F1 Score:  0.7849148991497275
Accuracy:  0.78081 

              precision    recall  f1-score   support

    negative       0.79      0.76      0.78    199581
    positive       0.77      0.80      0.78    200419

    accuracy                           0.78    400000
   macro avg       0.78      0.78      0.78    400000
weighted avg       0.78      0.78      0.78    400000

Confusion Matrix:
 [[152345  47236]
 [ 40440 159979]]


In [68]:
predict = lambda x: lr.predict(vectorizer.transform([x]))
sentiment  = lambda x: 'positive' if predict(x) == 'positive' else 'negative'
tweet = "I like this movie"
sentiment(tweet)

'positive'

In [70]:
vectorizer = TfidfVectorizer()
coefficients = pd.DataFrame(lr.coef_, columns=vectorizer.get_feature_names()).T
coefficients.columns = ['coefficients']
coefficients['abs'] = coefficients['coefficients'].apply(lambda x: abs(x))
coefficients.sort_values(by='abs', ascending=False).head(10)

<IPython.core.display.Javascript object>

AttributeError: 'TfidfVectorizer' object has no attribute 'get_feature_names'

In [None]:
plt.figure(figsize=(10, 10))
sns.barplot(x='coefficients', y=coefficients.sort_values(by='abs', ascending=False).head(10).index, data=coefficients.sort_values(by='abs', ascending=False).head(10))
plt.title('Top 10 Positive and Negative Words')
plt.show()

In [None]:
plt.figure(figsize=(10, 10))
sns.barplot(x='coefficients', y=coefficients.sort_values(by='abs', ascending=False).tail(10).index, data=coefficients.sort_values(by='abs', ascending=False).tail(10))
plt.title('Top 10 Positive and Negative Words')
plt.show()