In [1]:
import pandas as pd
import re

import matplotlib.pyplot as plt
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.express as px
import numpy as np

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

# Data Explore

In [21]:
# reading data
df = pd.read_csv('tweets.csv', header=None, usecols=[0,5])
df.columns = ['sentiment', 'text']

[0 4]


In [None]:
# temporary choosing 10000 tweets to speed up crunch
df = df.sample(10000)
df.head()

In [20]:
# replacing 0 and 4 with -1 and +1
df['sentiment'] = df['sentiment'].apply(lambda sentiment : 1 if sentiment == 4 else -1)

[0 4]


# Data crunch

In [4]:
def remove_punctuation(text):
    final = "".join(u for u in text if u not in ("?", ".", ";", ":", "!",'"', '-'))
    return final

def remove_usertag(text):
    return re.sub('@\w+', '', text)

def remove_links(text):
    return re.sub('(http|https)([^\s]+)', '', text)

def remove_stopwords(text):
    # Reeeeally slows down the script
    text_tokens = word_tokenize(text)
    tokens_without_sw = [word for word in text_tokens if not word in stopwords.words()]
    filtered_tweet = (" ").join(tokens_without_sw)
    return filtered_tweet

def trim_text(text):
    no_punctuation = remove_punctuation(text)
    no_usertag = remove_usertag(no_punctuation)
    final = remove_links(no_usertag)
    # no_stops = remove_stopwords(no_links)
    return final

df['text'] = df['text'].apply(trim_text)
df.head(5)

Unnamed: 0,sentiment,text
0,-1,"Awww, that's a bummer You shoulda got Davi..."
1,-1,is upset that he can't update his Facebook by ...
2,-1,I dived many times for the ball Managed to sa...
3,-1,my whole body feels itchy and like its on fire
4,-1,"no, it's not behaving at all i'm mad why am i..."


# Data visualization

In [None]:
df['sentiment_plot'] = df['sentiment'].replace({-1 : 'negative'})
df['sentiment_plot'] = df['sentiment_plot'].replace({1 : 'positive'})

fig = px.histogram(df, x="sentiment_plot")
fig.update_traces(marker_color="indianred",marker_line_color='rgb(8,48,107)',
                  marker_line_width=1.5)
fig.update_layout(title_text='Tweet Sentiment')
fig.show()

# 

# Creating model

In [5]:
# Train and test data
df['random_number'] = np.random.randn(len(df.index))

train = df[df['random_number'] <= 0.8]
test = df[df['random_number'] > 0.8]

In [6]:
# count vectorizer (bag of words):
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
train_matrix = vectorizer.fit_transform(train['text'])
test_matrix = vectorizer.transform(test['text'])

In [8]:
model = LogisticRegression(max_iter=100)

X_train = train_matrix
X_test = test_matrix
y_train = train['sentiment']
y_test = test['sentiment']

model.fit(X_train,y_train)


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



LogisticRegression()

In [9]:
predictions = model.predict(X_test)

confusion_matrix(predictions, y_test)

array([[131646,  31407],
       [ 37886, 138113]])

In [10]:
print(classification_report(predictions,y_test))

              precision    recall  f1-score   support

          -1       0.78      0.81      0.79    163053
           1       0.81      0.78      0.80    175999

    accuracy                           0.80    339052
   macro avg       0.80      0.80      0.80    339052
weighted avg       0.80      0.80      0.80    339052

