In [90]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
!pip install textblob
import textblob



In [91]:
stock_df = pd.read_csv(r'stock-headlines.csv')
stock_df

Unnamed: 0,neutral,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing ."
0,neutral,Technopolis plans to develop in stages an area...
1,negative,The international electronic industry company ...
2,positive,With the new production plant the company woul...
3,positive,According to the company 's updated strategy f...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...
...,...,...
4840,negative,LONDON MarketWatch -- Share prices ended lower...
4841,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4842,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4843,negative,Net sales of the Paper segment decreased to EU...


In [92]:
stock_df.rename(columns={"neutral":"Sentiment", "According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .":"Headline"}, inplace=True)

In [93]:
df = stock_df.drop(stock_df.tail(4500).index)
df

Unnamed: 0,Sentiment,Headline
0,neutral,Technopolis plans to develop in stages an area...
1,negative,The international electronic industry company ...
2,positive,With the new production plant the company woul...
3,positive,According to the company 's updated strategy f...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...
...,...,...
340,positive,"In the reporting period , net sales rose by 8 ..."
341,positive,"In the reporting period , the company 's opera..."
342,positive,Last year 's net sales rose to EUR 68.3 millio...
343,positive,Last year the company raised its turnover to a...


In [94]:
df.drop("Sentiment", axis=1, inplace=True)

In [95]:
import nltk

In [87]:
#This is data cleaning process wherein all the stopwords from the data are removed, the characters are all converted to 
#lower case and stemming has been performed. Later instead of the original tweets, cleaned and preprocessed tweets are stored 
#under the same column heading.
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(len(df)):
  review = re.sub('[^a-zA-Z]', ' ', str(df['Headline'][i]))
  review = review.lower()
  review = review.split()
  ps = PorterStemmer()
  all_stopwords = stopwords.words('english')
  all_stopwords.remove('not')
  review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
  review = ' '.join(review)
  df['Headline'][i] = review

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DEll\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [96]:
#TextBlob package of Python is used for sentiment analysis of the tweets.
from textblob import TextBlob

# compute sentiment scores (polarity) and labels
for i in range (len(df)) :
   df['sentiment_scores_tb'] = [round(TextBlob(article).sentiment.polarity, 3) for article in df['Headline']]
   df['sentiment_category_tb'] = ['positive' if score > 0 
                             else 'negative' if score < 0 
                                 else 'neutral' 
                                     for score in df['sentiment_scores_tb']]

In [97]:
df

Unnamed: 0,Headline,sentiment_scores_tb,sentiment_category_tb
0,Technopolis plans to develop in stages an area...,0.083,positive
1,The international electronic industry company ...,0.000,neutral
2,With the new production plant the company woul...,-0.065,negative
3,According to the company 's updated strategy f...,0.000,neutral
4,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...,0.500,positive
...,...,...,...
340,"In the reporting period , net sales rose by 8 ...",0.112,positive
341,"In the reporting period , the company 's opera...",0.000,neutral
342,Last year 's net sales rose to EUR 68.3 millio...,0.200,positive
343,Last year the company raised its turnover to a...,-0.200,negative


## Now extracting company names from news headlines

In [98]:
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [104]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

In [127]:
df['Company'] = ''
for i in range (len(df)):
    doc =  nlp(df.Headline[i])
    for X in doc.ents:
        if X.label_ == 'ORG': 
            df.Company[i] = X.text
    #print([(X.text, X.label_) for X in doc.ents])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.Company[i] = X.text


In [143]:
len(df[df['Company'] == ''])

184

In [153]:
df[df['Company']!= ''][['Company' , 'sentiment_scores_tb' , 'sentiment_category_tb']]

Unnamed: 0,Company,sentiment_scores_tb,sentiment_category_tb
0,Technopolis,0.083,positive
1,Elcoteq,0.000,neutral
4,HDI,0.500,positive
5,EUR7,0.000,neutral
6,EUR,0.000,neutral
...,...,...,...
333,Fiskars Brands,0.000,neutral
334,China Mobile,0.117,positive
335,HydroCopper,0.250,positive
339,YIT,0.000,neutral


In [154]:
df[df['Company']!= '']

Unnamed: 0,Headline,sentiment_scores_tb,sentiment_category_tb,Company
0,Technopolis plans to develop in stages an area...,0.083,positive,Technopolis
1,The international electronic industry company ...,0.000,neutral,Elcoteq
4,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...,0.500,positive,HDI
5,"For the last quarter of 2010 , Componenta 's n...",0.000,neutral,EUR7
6,"In the third quarter of 2010 , net sales incre...",0.000,neutral,EUR
...,...,...,...,...
333,Fiskars Brands report net sales of EUR 145.8 m...,0.000,neutral,Fiskars Brands
334,GeoSentric Oyj s GeoSolutions Business Unit Gy...,0.117,positive,China Mobile
335,HELSINKI AFX - Outokumpu Technology said it ha...,0.250,positive,HydroCopper
339,"In July-September 2008 , YIT 's net sales incr...",0.000,neutral,YIT


In [163]:
df[(df['sentiment_category_tb']== 'positive') & (df['Company']!= '')]

Unnamed: 0,Headline,sentiment_scores_tb,sentiment_category_tb,Company
0,Technopolis plans to develop in stages an area...,0.083,positive,Technopolis
4,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...,0.5,positive,HDI
10,"STORA ENSO , NORSKE SKOG , M-REAL , UPM-KYMMEN...",0.475,positive,UPM
13,Clothing retail chain Sepp+�l+� 's sales incre...,0.6,positive,EUR
16,"HELSINKI ( AFX ) - Shares closed higher , led ...",0.075,positive,Nokia
23,Net sales increased to EUR193 .3 m from EUR179...,0.3,positive,EUR43 .1
29,The Brazilian unit of Finnish security solutio...,0.045,positive,Reseller
37,Commission income increased by 22 % to EUR 4.4...,0.6,positive,EUR
38,"In January , traffic , measured in revenue pas...",0.5,positive,ASK
53,Shares of Standard Chartered ( STAN ) rose 1.2...,0.4,positive,BCS


In [156]:
textblob_sentiment

[['Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said .',
  0.08333333333333333,
  0.06666666666666667],
 ['The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported .',
  0.0,
  0.16666666666666666],
 ['With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .',
  -0.06480186480186481,
  0.4386946386946387],
 ["According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 20 % -40 % with an operating profit margin of 10 % -20 % of net sales .",
  0.0,
  0.0],
 ["FINANC

In [164]:
new_df = df.copy()

In [165]:
#This is data cleaning process wherein all the stopwords from the data are removed, the characters are all converted to 
#lower case and stemming has been performed. Later instead of the original tweets, cleaned and preprocessed tweets are stored 
#under the same column heading.
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(len(new_df)):
  review = re.sub('[^a-zA-Z]', ' ', str(new_df['Headline'][i]))
  review = review.lower()
  review = review.split()
  ps = PorterStemmer()
  all_stopwords = stopwords.words('english')
  all_stopwords.remove('not')
  review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
  review = ' '.join(review)
  new_df['Headline'][i] = review

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DEll\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Headline'][i] = review


In [166]:
new_df

Unnamed: 0,Headline,sentiment_scores_tb,sentiment_category_tb,Company
0,technopoli plan develop stage area less squar ...,0.083,positive,Technopolis
1,intern electron industri compani elcoteq laid ...,0.000,neutral,Elcoteq
2,new product plant compani would increas capac ...,-0.065,negative,
3,accord compani updat strategi year baswar targ...,0.000,neutral,
4,financ aspocomp growth aspocomp aggress pursu ...,0.500,positive,HDI
...,...,...,...,...
340,report period net sale rose year year eur due ...,0.112,positive,
341,report period compani oper profit grew eur mil...,0.000,neutral,
342,last year net sale rose eur million eur million,0.200,positive,EUR
343,last year compani rais turnov approxim million...,-0.200,negative,
