In [None]:
import pandas as pd

In [None]:
# Load the news headlines from a CSV file located in the working directory.
data = pd.read_csv(filepath_or_buffer='news.csv')

In [None]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

Unnamed: 0,publish_date,headline
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [None]:
import nltk
# Fetch the 'vader_lexicon' resource into the local nltk_data directory.
# Presumably this is the lexicon the VADER SentimentIntensityAnalyzer created
# in a later cell loads -- confirm if the resource name ever changes.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Single shared analyzer instance; vadar_sentiment() reads this module-level name.
# Note: categorise_sentiment() takes a parameter that is also called `sentiment`,
# which shadows this object inside that function only.
sentiment = SentimentIntensityAnalyzer()

In [None]:
def vadar_sentiment(text):
    """Score `text` with the NLTK VADER (lexicon-based) analyzer.

    Runs the module-level `sentiment` analyzer on `text` and returns only
    the 'compound' entry of the resulting polarity scores.
    """
    scores = sentiment.polarity_scores(text)
    return scores['compound']

# Add a column holding the VADER compound score of every headline.
data['vadar compound'] = data['headline'].map(vadar_sentiment)

def categorise_sentiment(sentiment, neg_threshold=-0.05, pos_threshold=0.05):
    """Map a sentiment score to a string label.

    Parameters
    ----------
    sentiment : float
        Score to categorise. Inside this function the name refers to the
        score argument, not to the module-level analyzer of the same name.
    neg_threshold : float, default -0.05
        Scores strictly below this value are labelled 'negative'.
    pos_threshold : float, default 0.05
        Scores strictly above this value are labelled 'positive'.

    Returns
    -------
    str
        'negative', 'positive' or 'neutral'. Everything from
        `neg_threshold` to `pos_threshold` inclusive is 'neutral'; a NaN
        score fails both comparisons and is therefore also 'neutral'.
    """
    # Both comparisons are strict, so a score sitting exactly on a threshold
    # is treated as neutral.
    if sentiment < neg_threshold:
        return 'negative'
    if sentiment > pos_threshold:
        return 'positive'
    return 'neutral'

# Derive a string sentiment label for each row from its VADER compound score.
data['sentiment'] = data['vadar compound'].map(categorise_sentiment)

In [None]:
# Preview the first five rows, now including the score and label columns.
data.head(n=5)

Unnamed: 0,publish_date,headline,vadar compound,sentiment
0,20030219,aba decides against community broadcasting lic...,0.0,neutral
1,20030219,act fire witnesses must be aware of defamation,-0.34,negative
2,20030219,a g calls for infrastructure protection summit,0.0,neutral
3,20030219,air nz staff in aust strike for pay rise,-0.2263,negative
4,20030219,air nz strike to affect australian travellers,-0.128,negative


In [None]:
#Feature Generation Using Bag of Words
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer

In [None]:
# Tokenise on runs of ASCII letters/digits; everything else acts as a separator.
token = RegexpTokenizer(r'[a-zA-Z0-9]+')

# A custom tokenizer replaces CountVectorizer's built-in token_pattern, so the
# pattern is set to None explicitly. Leaving it at its default makes
# scikit-learn warn that token_pattern is unused.
cv = CountVectorizer(
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 1),
    tokenizer=token.tokenize,
    token_pattern=None,
)

# NOTE(review): the vocabulary is learned from every headline, including rows
# that are later held out for testing. Consider fitting on the training rows
# only and calling transform() on the test rows.
text_counts = cv.fit_transform(data['headline'])

In [None]:
# Dimensions of the bag-of-words matrix returned by cv.fit_transform:
# (number of headlines, number of distinct tokens kept in the vocabulary).
text_counts.shape

(1048575, 90781)

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    text_counts,
    data['sentiment'],
    test_size=0.3,
    random_state=29,
)

In [None]:
# Model building: fit a decision tree on the bag-of-words training features.
from sklearn.tree import DecisionTreeClassifier

# Seed the estimator so the fitted tree (and every metric derived from it) is
# reproducible across kernel restarts. scikit-learn permutes candidate features
# at each split, so an unseeded tree can differ between runs on identical data.
# The value matches the seed already used for the train/test split.
dt = DecisionTreeClassifier(random_state=29)
dt.fit(X_train, y_train)

DecisionTreeClassifier()

In [None]:
# Predict a sentiment label for every held-out row.
pred = dt.predict(X=X_test)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 of the predictions against the held-out labels.
print(classification_report(y_true=y_test, y_pred=pred))

              precision    recall  f1-score   support

    negative       0.92      0.94      0.93    107686
     neutral       0.97      0.95      0.96    142325
    positive       0.91      0.92      0.92     64562

    accuracy                           0.94    314573
   macro avg       0.93      0.93      0.93    314573
weighted avg       0.94      0.94      0.94    314573

