In [1]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

import pandas as pd  
import numpy as np

In [2]:
df = pd.read_csv('cleaned_data.csv', index_col=0)
df.head()

  mask |= (ar1 == a)


Unnamed: 0,text,sentiment
0,awww that bummer you shoulda got david carr of...,0
1,is upset that he can not update his facebook b...,0
2,dived many times for the ball managed to save ...,0
3,my whole body feels itchy and like its on fire,0
4,no it not behaving at all mad why am here beca...,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1596522 entries, 0 to 1599999
Data columns (total 2 columns):
text         1596522 non-null object
sentiment    1596522 non-null int64
dtypes: int64(1), object(1)
memory usage: 36.5+ MB


# Training Logistic Regression and predicting tweets polarity

In [4]:
x = df.text
y = df.sentiment

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.5)

In [5]:
log_reg = LogisticRegression()
vectorizer = TfidfVectorizer()

vectorizer.set_params(stop_words=None, max_features=100000, ngram_range=(1, 3))
pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('classifier', log_reg)
])
sentiment_fit = pipeline.fit(x_train, y_train)
y_pred = sentiment_fit.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy score: {0:.2f}\n".format(accuracy))



Accuracy score: 0.82



In [47]:
# testing prediction

tweets = ['i love you', 'hate this day', 'i can not do this', 'this is the best chololate cake']
predict = sentiment_fit.predict(tweets)
predict

array([1, 0, 0, 1], dtype=int64)

## Tweets preparing

In [7]:
tweets_df = pd.read_csv('received_tweets.csv', index_col=0)
tweets_df.head()

Unnamed: 0,tweet,coordinates,location,language
0,"Canada’s @PaulMGrod, President of @UWCongress,...",,Ukraine,en
1,RT @expatua: Epifaniy: Russian Orthodox Church...,,"Kyiv, Ukraine",en
2,RT @Reuters: Ecumenical Patriarch signs decree...,,Київ,en
3,RT @TarasKuzio: Putin’s defeat as Ukraine rece...,,愛知 尾張旭市,tr
4,Ukraine: new Orthodox church gains independenc...,,"Dublin, Ireland.",en


In [14]:
from tweet_preprocessing import PreprocessTweet

def clean_tweet(tweet):
    t = PreprocessTweet()
    bs_tweet = t.decode_HTML(tweet)
    bom_tweet = t.remove_BOM(bs_tweet)
    tweet = t.remove_mentions(bom_tweet)
    tweet = t.remove_links(tweet)
    tweet_lowercase = t.to_lower(tweet)
    without_negations = t.hadle_negations(tweet_lowercase)
    tweet = t.remove_nonletter_characters(without_negations)
    return t.tokenize(tweet)

In [13]:
#testing clean_tweet function

clean_tweet('@BBCWorld: Ukraine Orthodox Church to be granted independence 77 from Russian Church https://t.co/1qIw8PW7ic')

'ukraine orthodox church to be granted independence from russian church'

In [27]:
en_tweets = tweets_df[tweets_df.language=='en']
en_tweets.head(10)

Unnamed: 0,tweet,coordinates,location,language
0,"Canada’s @PaulMGrod, President of @UWCongress,...",,Ukraine,en
1,RT @expatua: Epifaniy: Russian Orthodox Church...,,"Kyiv, Ukraine",en
2,RT @Reuters: Ecumenical Patriarch signs decree...,,Київ,en
4,Ukraine: new Orthodox church gains independenc...,,"Dublin, Ireland.",en
5,RT @BBCWorld: Ukraine Orthodox Church to be gr...,,"Brooklyn, NY",en
6,RT @AP: The Ecumenical Patriarch of Constantin...,,"Buffalo, New York",en
7,Tomos signing ceremony in Constantinople ends ...,,"Richmond Hill, Ontario, Canada",en
10,RT @BungeeWedgie: #Ukraine post office celebra...,,12358 Fibonacci Cir.,en
11,RT @DECLEX: 2/3rd point had me in tears. 😂 htt...,,"Lokoja, Nigeria",en
13,RT @MrDtAFC: @BBCMOTD BATE Borisov v Arsenal -...,,Uswazi,en


In [33]:
en_tweets['cleaned_tweet'] = en_tweets['tweet'].apply(clean_tweet)

  ' Beautiful Soup.' % markup)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [36]:
predictions = sentiment_fit.predict(en_tweets.cleaned_tweet)
predictions

array([1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 0, 1, 1, 0, 1, 1], dtype=int64)

In [38]:
en_tweets['prediction'] = predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [40]:
en_tweets.head()

Unnamed: 0,tweet,coordinates,location,language,cleaned_tweet,prediction
0,"Canada’s @PaulMGrod, President of @UWCongress,...",,Ukraine,en,canada president of joins metropolitan in ista...,1
1,RT @expatua: Epifaniy: Russian Orthodox Church...,,"Kyiv, Ukraine",en,rt epifaniy russian orthodox church to continu...,1
2,RT @Reuters: Ecumenical Patriarch signs decree...,,Київ,en,rt ecumenical patriarch signs decree granting ...,1
4,Ukraine: new Orthodox church gains independenc...,,"Dublin, Ireland.",en,ukraine new orthodox church gains independence...,1
5,RT @BBCWorld: Ukraine Orthodox Church to be gr...,,"Brooklyn, NY",en,rt ukraine orthodox church to be granted indep...,1


In [45]:
en_tweets['prediction'].value_counts()

1    143
0     21
Name: prediction, dtype: int64

As 1 represents positive and 0 - negative, (and if we agree to trust the model with 0.82 accuracy score) we can see that people mostly tweet positive things about Ukraine :)