In [1]:
import pandas as pd
from bs4 import BeautifulSoup 
df = pd.read_csv("stock_news.csv",)

In [2]:
df.dtypes

Unnamed: 0     int64
headline      object
label         object
dtype: object

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,headline,label
0,0,"Markets Close Mostly Lower Again; ROST, PANW R...",Negative
1,1,"Gap plummets on earnings miss, cuts full-year ...",Negative
2,2,Billionaire Ken Fisher is Selling These 10 Stocks,Negative
3,3,"Corning net income drops 13%, shares fall",Negative
4,4,Internet Explorer shutdown to cause Japan prob...,Negative


In [5]:
df.tail()

Unnamed: 0.1,Unnamed: 0,headline,label
25995,25995,"Boston Scientific (BSX) Q1 Earnings Top, 2022 ...",Positive
25996,25996,Producer sentiment improves with strengthened ...,Positive
25997,25997,Equinix Declares Quarterly Dividend on Its Com...,Positive
25998,25998,FEATURE-'Love Island' dumps fast fashion for s...,Positive
25999,25999,"After Plunging 15.5% in 4 Weeks, Here's Why th...",Positive


In [9]:
df["headline"] = df["headline"].astype("string")

In [10]:
df.dtypes

headline    string[python]
label               object
dtype: object

In [11]:
urlPattern        = r"((http://)[^ ]*|(https://)[^ ]*|(www\.)[^ ]*)"
userPattern       = '@[^\s]+'
hashtagPattern    = '#[^\s]+'
alphaPattern      = "[^a-z0-9<>]"
sequencePattern   = r"(.)\1\1+"
seqReplacePattern = r"\1\1"

import re

In [13]:
df["headline"]= df["headline"].str.lower()

In [14]:
df

Unnamed: 0,headline,label
0,"markets close mostly lower again; rost, panw r...",Negative
1,"gap plummets on earnings miss, cuts full-year ...",Negative
2,billionaire ken fisher is selling these 10 stocks,Negative
3,"corning net income drops 13%, shares fall",Negative
4,internet explorer shutdown to cause japan prob...,Negative
...,...,...
25995,"boston scientific (bsx) q1 earnings top, 2022 ...",Positive
25996,producer sentiment improves with strengthened ...,Positive
25997,equinix declares quarterly dividend on its com...,Positive
25998,feature-'love island' dumps fast fashion for s...,Positive


## Removing Punctuatuation

In [15]:
import string
exclude = string.punctuation
exclude

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [17]:
def remove_punc(text):
    for char in exclude:
        text = text.replace(char,"")
        
    return text

df["headline"] = df["headline"].apply(remove_punc)

In [32]:
df

Unnamed: 0,Sentiment,Sentence
0,neutral,technopolis plans to develop in stages an area...
1,negative,the international electronic industry company ...
2,positive,with the new production plant the company woul...
3,positive,according to the company s updated strategy fo...
4,positive,financing of aspocomp s growth aspocomp is agg...
...,...,...
4840,negative,london marketwatch share prices ended lower i...
4841,neutral,rinkuskiai s beer sales fell by 65 per cent to...
4842,negative,operating profit fell to eur 354 mn from eur 6...
4843,negative,net sales of the paper segment decreased to eu...


## Removing Stopwords

In [18]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize



In [19]:
stop_words = stopwords.words("english")
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [20]:
def remove_stopwords(text):
    tokens = word_tokenize(text)
    filtered_sentence = [x for x in tokens if x.lower() not in stop_words]
    return " ".join(filtered_sentence)

In [21]:
df["headline"] = df["headline"].apply(remove_stopwords)

In [22]:
df

Unnamed: 0,headline,label
0,markets close mostly lower rost panw report ea...,Negative
1,gap plummets earnings miss cuts fullyear forecast,Negative
2,billionaire ken fisher selling 10 stocks,Negative
3,corning net income drops 13 shares fall,Negative
4,internet explorer shutdown cause japan problem...,Negative
...,...,...
25995,boston scientific bsx q1 earnings top 2022 rev...,Positive
25996,producer sentiment improves strengthened commo...,Positive
25997,equinix declares quarterly dividend common stock,Positive
25998,featurelove island dumps fast fashion secondha...,Positive


## Correcting Sentences

In [38]:
from joblib import Parallel,delayed

In [39]:
from textblob import TextBlob

def correct(text):
    textblob = TextBlob(text)
    corrected = textblob.correct().string
    return corrected
    

In [40]:
df["Sentence"] = Parallel(n_jobs=-1)(delayed(correct)(text) for text in df["Sentence"])



In [41]:
df

Unnamed: 0,Sentiment,Sentence
0,neutral,technopolis plans develop stages area less 100...
1,negative,international electronic industry company elco...
2,positive,new production plant company would increase ca...
3,positive,according company updated strategy years 20092...
4,positive,financing aspocomp growth aspocomp aggressive ...
...,...,...
4840,negative,london marketwatch share prices ended lower lo...
4841,neutral,rinkuskiai beer sales fell 65 per cent 416 mil...
4842,negative,operating profit fell our 354 in our 688 in 20...
4843,negative,net sales paper segment decreased our 2216 in ...


## Lemetization

In [23]:
import spacy

In [24]:
nlp = spacy.load("en_core_web_sm")
def lemmetize(text):
    doc = nlp(text)
    lemmas = [token.lemma_ for token in doc]
    lemmatized_text = ' '.join(lemmas)
    return lemmatized_text

In [25]:
df["headline"] = df["headline"].apply(lemmetize)

In [31]:
df

Unnamed: 0,headline,label
0,market close mostly low rost panw report earning,Negative
1,gap plummet earning miss cut fullyear forecast,Negative
2,billionaire ken fisher sell 10 stock,Negative
3,corn net income drop 13 share fall,Negative
4,internet explorer shutdown cause japan problem...,Negative
...,...,...
25995,boston scientific bsx q1 earning top 2022 reve...,Positive
25996,producer sentiment improves strengthen commodi...,Positive
25997,equinix declare quarterly dividend common stock,Positive
25998,featurelove island dump fast fashion secondhan...,Positive


In [28]:
df.to_csv("headlines_refined.csv")

In [29]:
d2 = pd.read_csv("headlines_refined.csv")

In [30]:
d2

Unnamed: 0.1,Unnamed: 0,headline,label
0,0,market close mostly low rost panw report earning,Negative
1,1,gap plummet earning miss cut fullyear forecast,Negative
2,2,billionaire ken fisher sell 10 stock,Negative
3,3,corn net income drop 13 share fall,Negative
4,4,internet explorer shutdown cause japan problem...,Negative
...,...,...,...
25995,25995,boston scientific bsx q1 earning top 2022 reve...,Positive
25996,25996,producer sentiment improves strengthen commodi...,Positive
25997,25997,equinix declare quarterly dividend common stock,Positive
25998,25998,featurelove island dump fast fashion secondhan...,Positive
