In [None]:
import pandas as pd
import re

import matplotlib.pyplot as plt
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.express as px
import numpy as np

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

# Data exploration

In [None]:
# Read the tweets file, which has no header row, keeping only columns 0 and 5,
# and give those two columns readable names.
kept_columns = [0, 5]
df = pd.read_csv('tweets.csv', header=None, usecols=kept_columns)
df.columns = ['sentiment', 'text']

In [None]:
# Temporarily keep a random sample of 1,000 tweets to speed up processing.
# A fixed random_state draws the same rows on every run, so downstream
# results are reproducible after Restart & Run All.
df = df.sample(1000, random_state=42)
df.head()

In [None]:
# Recode the labels: 4 becomes +1 and every other value becomes -1.
is_positive = df['sentiment'].eq(4)
df['sentiment'] = is_positive.map({True: 1, False: -1})

# Data cleaning

In [None]:
def remove_punctuation(text):
    """Return `text` with the characters ? . ; : ! " and - removed.

    Other punctuation (commas, apostrophes, ...) is left untouched.
    """
    dropped = ("?", ".", ";", ":", "!", '"', '-')
    kept_chars = []
    for char in text:
        if char in dropped:
            continue
        kept_chars.append(char)
    return "".join(kept_chars)

def remove_usertag(text):
    """Return `text` with every @mention ('@' followed by word characters) removed.

    A lone '@' with no word character after it is left in place. Surrounding
    whitespace is not collapsed.
    """
    # Raw string: '\w' in a plain string literal is an invalid escape sequence.
    return re.sub(r'@\w+', '', text)

def remove_links(text):
    """Return `text` with every run starting at 'http' and extending to the next whitespace removed.

    The match only requires the 'http' prefix plus at least one further
    non-whitespace character; it does not require '://'.
    """
    # Raw string: '\s' in a plain string literal is an invalid escape sequence.
    # 'https?\S+' matches exactly what '(http|https)([^\s]+)' matched.
    return re.sub(r'https?\S+', '', text)

_STOPWORD_CACHE = None


def _all_stopwords():
    """Return the NLTK stopword list as a frozenset, loading it only once."""
    global _STOPWORD_CACHE
    if _STOPWORD_CACHE is None:
        # NOTE(review): words() is called without a language, as before; this
        # presumably spans every language in the corpus. Pass 'english' here if
        # only English stopwords are intended -- confirm.
        _STOPWORD_CACHE = frozenset(stopwords.words())
    return _STOPWORD_CACHE


def remove_stopwords(text):
    """Tokenize `text` and return the tokens that are not stopwords, joined by single spaces.

    The stopword list used to be re-read from the corpus for every token, which
    dominated the runtime; it is now loaded once and looked up in a set.
    """
    known_stopwords = _all_stopwords()
    text_tokens = word_tokenize(text)
    tokens_without_sw = [word for word in text_tokens if word not in known_stopwords]
    return " ".join(tokens_without_sw)

def trim_text(text):
    """Clean one tweet by running it through every cleaning step in order.

    Order: punctuation removal, @mention removal, link removal, stopword removal.
    """
    # NOTE(review): punctuation is stripped first, so URLs have already lost
    # their ':' and '.' by the time remove_links runs on them.
    cleaning_steps = (
        remove_punctuation,
        remove_usertag,
        remove_links,
        remove_stopwords,
    )
    cleaned = text
    for step in cleaning_steps:
        cleaned = step(cleaned)
    return cleaned

# Clean every tweet in place, then preview the first five rows.
df['text'] = df['text'].map(trim_text)
df.head()

# Data visualization

In [None]:
# Human-readable labels for the plot: -1 -> 'negative', 1 -> 'positive'.
sentiment_names = {-1: 'negative', 1: 'positive'}
df['sentiment_plot'] = df['sentiment'].replace(sentiment_names)

# Bar styling shared by all traces of the histogram.
bar_style = dict(
    marker_color="indianred",
    marker_line_color='rgb(8,48,107)',
    marker_line_width=1.5,
)

# Histogram of how many tweets carry each sentiment label.
fig = px.histogram(df, x="sentiment_plot")
fig.update_traces(**bar_style)
fig.update_layout(title_text='Tweet Sentiment')
fig.show()

# 

# Creating model

In [None]:
# Train and test data
# Draw one uniform number in [0, 1) per row, so that roughly 80% of the rows
# fall at or below the 0.8 cut-off and go to train, the rest to test.
# (np.random.randn draws from a standard normal, for which 0.8 is not the
# 80% quantile, so it is not suitable for this threshold.)
# The fixed seed keeps the split identical across Restart & Run All.
split_rng = np.random.default_rng(42)
df['random_number'] = split_rng.random(len(df.index))

train = df[df['random_number'] <= 0.8]
test = df[df['random_number'] > 0.8]

In [None]:
# Bag-of-words features. The token pattern matches any run of one or more
# word characters between word boundaries.
word_pattern = r'\b\w+\b'
vectorizer = CountVectorizer(token_pattern=word_pattern)

# Fit the vocabulary on the training tweets only, then reuse it for the test tweets.
train_docs, test_docs = train['text'], test['text']
train_matrix = vectorizer.fit_transform(train_docs)
test_matrix = vectorizer.transform(test_docs)

In [None]:
# Features and labels for both halves of the split.
X_train, y_train = train_matrix, train['sentiment']
X_test, y_test = test_matrix, test['sentiment']

# Logistic regression capped at 100 solver iterations, fitted on the training half.
model = LogisticRegression(max_iter=100)
model.fit(X_train, y_train)

In [None]:
# Predict the sentiment of the held-out tweets.
predictions = model.predict(X_test)

# scikit-learn expects (y_true, y_pred): rows are the true labels and columns
# the predicted ones. Passing them the other way round transposes the matrix.
confusion_matrix(y_test, predictions)

In [None]:
# scikit-learn expects (y_true, y_pred); swapping them exchanges precision and
# recall in the report.
print(classification_report(y_test, predictions))