In [1]:
#importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [2]:
# Location of the input CSV; the path is relative, so it is resolved against
# the kernel's current working directory.
path = r'Data.csv'
# Decode the file as ISO-8859-1 rather than pandas' default UTF-8.
# NOTE(review): presumably the file contains bytes that are not valid UTF-8 — confirm.
df = pd.read_csv(path, encoding = 'ISO-8859-1')

In [3]:
# (rows, columns) of the frame exactly as loaded, before any cleaning.
df.shape

(4101, 27)

In [4]:
# Number of columns in the loaded frame (second entry of its shape).
df.shape[1]

27

In [5]:
# Preview the first two rows to check the column layout.
df.head(2)

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,2000-01-03,0,A 'hindrance to operations': extracts from the...,Scorecard,Hughes' instant hit buoys Blues,Jack gets his skates on at ice-cold Alex,Chaos as Maracana builds up for United,Depleted Leicester prevail as Elliott spoils E...,Hungry Spurs sense rich pickings,Gunners so wide of an easy target,...,Flintoff injury piles on woe for England,Hunters threaten Jospin with new battle of the...,Kohl's successor drawn into scandal,The difference between men and women,"Sara Denver, nurse turned solicitor",Diana's landmine crusade put Tories in a panic,Yeltsin's resignation caught opposition flat-f...,Russian roulette,Sold out,Recovering a title
1,2000-01-04,0,Scorecard,The best lake scene,Leader: German sleaze inquiry,"Cheerio, boyo",The main recommendations,Has Cubie killed fees?,Has Cubie killed fees?,Has Cubie killed fees?,...,On the critical list,The timing of their lives,Dear doctor,Irish court halts IRA man's extradition to Nor...,Burundi peace initiative fades after rebels re...,PE points the way forward to the ECB,Campaigners keep up pressure on Nazi war crime...,Jane Ratcliffe,Yet more things you wouldn't know without the ...,Millennium bug fails to bite


In [6]:
# Class balance of the target column. Missing labels would be counted as well
# (dropna=False) and the counts are not sorted by frequency (sort=False).
df["Label"].value_counts(sort=False, dropna=False)

0    1935
1    2166
Name: Label, dtype: int64

In [7]:
# Count the missing values in each column and list the worst columns first
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)

Top25    3
Top24    3
Top23    1
Top13    0
Top22    0
Top21    0
Top20    0
Top19    0
Top18    0
Top17    0
Top16    0
Top15    0
Top14    0
Date     0
Label    0
Top11    0
Top10    0
Top9     0
Top8     0
Top7     0
Top6     0
Top5     0
Top4     0
Top3     0
Top2     0
Top1     0
Top12    0
dtype: int64

In [8]:
# Discard every row that has at least one missing value, then report the new size
df = df.dropna()
print(df.shape)

(4098, 27)


In [9]:
# Work on an independent copy of the cleaned frame. Resetting the index
# renumbers the rows from 0 and keeps the old row labels in a new "index" column.
df_copy = df.copy().reset_index()

In [10]:
# Splitting the dataset into train and test sets.
# shuffle=False keeps the row order: the leading 90% of rows become the train
# set and the trailing 10% the test set. random_state is passed through but has
# no effect while shuffling is disabled.
X = df_copy.drop(columns="Label")
y = df_copy["Label"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    shuffle=False,
    random_state=42,
)
print(f'Train size: {X_train.shape}, Test size: {X_test.shape}')

Train size: (3688, 27), Test size: (410, 27)


In [11]:
# Remove the columns that are not headline text from both splits (in place)
non_text_columns = ["index", "Date"]
X_train.drop(columns=non_text_columns, inplace=True)
X_test.drop(columns=non_text_columns, inplace=True)

In [12]:
# Collapse every run of non-letter characters (punctuation, digits, symbols)
# into a single space, in both splits and in place
for frame in (X_train, X_test):
    frame.replace(to_replace='[^a-zA-Z]+', value=' ', regex=True, inplace=True)

In [13]:
# Rename the headline columns to the strings "0" .. "24"
new_columns = list(map(str, range(25)))
X_train.columns = new_columns
X_test.columns = new_columns

In [14]:
# Lower-case every headline column, one split at a time
for frame in (X_train, X_test):
    for column in new_columns:
        frame[column] = frame[column].str.lower()

In [15]:
# Merge the first 25 columns of each row into one space-separated string,
# giving a single document per row for each split
train_headlines = [
    ' '.join(map(str, values))
    for values in X_train.iloc[:, 0:25].itertuples(index=False, name=None)
]

test_headlines = [
    ' '.join(map(str, values))
    for values in X_test.iloc[:, 0:25].itertuples(index=False, name=None)
]

In [16]:
# Inspect the first combined training document as a sanity check of the cleaning steps.
train_headlines[0]

'a hindrance to operations extracts from the leaked reports scorecard hughes instant hit buoys blues jack gets his skates on at ice cold alex chaos as maracana builds up for united depleted leicester prevail as elliott spoils everton s party hungry spurs sense rich pickings gunners so wide of an easy target derby raise a glass to strupar s debut double southgate strikes leeds pay the penalty hammers hand robson a youthful lesson saints party like it s  wear wolves have turned into lambs stump mike catches testy gough s taunt langer escapes to hit  flintoff injury piles on woe for england hunters threaten jospin with new battle of the somme kohl s successor drawn into scandal the difference between men and women sara denver nurse turned solicitor diana s landmine crusade put tories in a panic yeltsin s resignation caught opposition flat footed russian roulette sold out recovering a title'

In [17]:
# Inspect the first combined test document as a sanity check of the cleaning steps.
test_headlines[0]

'sweden has proof foreign sub entered its waters report local hunters kill over boko haram insurgents recapture adamawa town isis releases a new speech of its leader abu bakr al baghdadi in which he urges his supporters to attack saudi arabia  russia says fleet of warships is off australia for climate research unique year old circular pyramid discovered in bolivia the israeli government has announced that it is denying entry to members of the un human rights council commission charged with investigating war crimes committed during the summer israeli invasion of the gaza strip  supreme court of canada ruling makes honesty the law for businesses the first jihadist to be tried in france after returning from syria has been given a seven year jail sentence flavien moreau travelled to syria and joined an islamist militant group but says he only stayed for a dozen days because he missed smoking banned by the jihadist group he joined  paedophiles have created a deep web version of kickstarter 

In [18]:
# Creating corpus of train dataset
# Each combined headline string is split on whitespace, stripped of English
# stop words, stemmed with the Porter stemmer and re-joined with single spaces.
ps = PorterStemmer()

# Build the stop-word set once. Re-creating it inside the word filter reloads
# the NLTK word list for every single token, which dominates the run time.
english_stopwords = set(stopwords.words('english'))

train_corpus = []
for headline in train_headlines:
    # Tokenizing the news-title by words and removing the stopwords
    kept_words = [word for word in headline.split() if word not in english_stopwords]

    # Stemming the words and joining them back into one document
    train_corpus.append(' '.join(ps.stem(word) for word in kept_words))

In [19]:
# Creating corpus of test dataset
# Same pipeline as for the train corpus: whitespace tokenisation, English
# stop-word removal, Porter stemming. Reuses the `ps` stemmer created earlier.

# Build the stop-word set once. Re-creating it inside the word filter reloads
# the NLTK word list for every single token, which dominates the run time.
english_stopwords = set(stopwords.words('english'))

test_corpus = []
for headline in test_headlines:
    # Tokenizing the news-title by words and removing the stopwords
    kept_words = [word for word in headline.split() if word not in english_stopwords]

    # Stemming the words and joining them back into one document
    test_corpus.append(' '.join(ps.stem(word) for word in kept_words))

In [23]:
# Show the first ten stemmed, stop-word-free training documents.
train_corpus[0:10]

['hindranc oper extract leak report scorecard hugh instant hit buoy blue jack get skate ice cold alex chao maracana build unit deplet leicest prevail elliott spoil everton parti hungri spur sens rich pick gunner wide easi target derbi rais glass strupar debut doubl southgat strike leed pay penalti hammer hand robson youth lesson saint parti like wear wolv turn lamb stump mike catch testi gough taunt langer escap hit flintoff injuri pile woe england hunter threaten jospin new battl somm kohl successor drawn scandal differ men women sara denver nurs turn solicitor diana landmin crusad put tori panic yeltsin resign caught opposit flat foot russian roulett sold recov titl',
 'scorecard best lake scene leader german sleaz inquiri cheerio boyo main recommend cubi kill fee cubi kill fee cubi kill fee hopkin furiou foster lack hannib appetit cubi kill fee tale two tail say like like say elbow eye nippl task forc assess risk asteroid collis found last critic list time live dear doctor irish cou

In [24]:
# Show the first ten stemmed, stop-word-free test documents.
test_corpus[0:10]

['sweden proof foreign sub enter water report local hunter kill boko haram insurg recaptur adamawa town isi releas new speech leader abu bakr al baghdadi urg support attack saudi arabia russia say fleet warship australia climat research uniqu year old circular pyramid discov bolivia isra govern announc deni entri member un human right council commiss charg investig war crime commit summer isra invas gaza strip suprem court canada rule make honesti law busi first jihadist tri franc return syria given seven year jail sentenc flavien moreau travel syria join islamist milit group say stay dozen day miss smoke ban jihadist group join paedophil creat deep web version kickstart crowdfund child porn killer robot strictli monitor nation demand un countri warn potenti danger autonom weapon system say risk violat intern humanitarian law toni abbot say australia noth bush white colonis speech given intern busi breakfast russia may claim damag franc deliv mistral warship news agenc franc call israe

In [20]:
# down_words = []
# for i in list(y_train[y_train==0].index):
#     down_words.append(train_corpus[i])

# up_words = []
# for i in list(y_train[y_train==1].index):
#     up_words.append(train_corpus[i])

In [21]:
# # Creating wordcloud for down_words
# from wordcloud import WordCloud
# wordcloud1 = WordCloud(background_color='white', width=3000, height=2500).generate(down_words[1])
# plt.figure(figsize=(8,8))
# plt.imshow(wordcloud1)
# plt.axis('off')
# plt.title("Words which indicate a fall in DJIA ")
# plt.show()

In [22]:
# # Creating wordcloud for up_words
# wordcloud2 = WordCloud(background_color='white', width=3000, height=2500).generate(up_words[5])
# plt.figure(figsize=(8,8))
# plt.imshow(wordcloud2)
# plt.axis('off')
# plt.title("Words which indicate a rise in DJIA ")
# plt.show()

In [25]:
# Creating the Bag of Words model
# Only two-word sequences (bigrams) are counted, and the vocabulary is capped
# at the 10,000 most frequent ones in the training corpus.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(2, 2), max_features=10000)
train_bigram_counts = cv.fit_transform(train_corpus)
# Convert the sparse count matrix into a dense NumPy array
X_train1 = train_bigram_counts.toarray()

In [26]:
# Encode the test corpus with the vocabulary already learned from the training
# corpus (transform only, no refit) and convert the result to a dense array.
X_test1 = cv.transform(test_corpus).toarray()

In [27]:
# Fit a logistic-regression classifier, with scikit-learn's default settings,
# on the dense training counts and the training labels.
from sklearn.linear_model import LogisticRegression
lr_classifier = LogisticRegression()
lr_classifier.fit(X_train1, y_train)

In [28]:
# Predict a label for every row of the encoded test set.
lr_y_pred = lr_classifier.predict(X_test1)

In [33]:
# Accuracy, Precision and Recall of the test-set predictions, shown on a 0-100 scale
# NOTE(review): only the accuracy line carries a '%' suffix; the precision and
# recall lines print the same 0-100 scale without it.
from sklearn.metrics import accuracy_score, precision_score, recall_score
score1, score2, score3 = (
    metric(y_test, lr_y_pred)
    for metric in (accuracy_score, precision_score, recall_score)
)
print("---- Scores ----")
for template, score in (
    ("Accuracy score is: {}%", score1),
    ("Precision score is: {}", score2),
    ("Recall score is: {}", score3),
):
    print(template.format(round(score*100,2)))

---- Scores ----
Accuracy score is: 51.22%
Precision score is: 52.05
Recall score is: 60.48


In [37]:
# Making the Confusion Matrix
# scikit-learn layout: rows are the true labels, columns are the predicted labels.
from sklearn.metrics import confusion_matrix
lr_cm = confusion_matrix(y_test, lr_y_pred)
lr_cm

array([[ 83, 117],
       [ 83, 127]], dtype=int64)

In [39]:
# Printing the classification report (per-class precision, recall, F1 and support)
from sklearn.metrics import classification_report
print(classification_report(y_test, lr_y_pred))

              precision    recall  f1-score   support

           0       0.50      0.41      0.45       200
           1       0.52      0.60      0.56       210

    accuracy                           0.51       410
   macro avg       0.51      0.51      0.51       410
weighted avg       0.51      0.51      0.51       410

