In [79]:
## Libraries and packages

import numpy as np
import pandas as pd
import re

## Loading the data

In [7]:
# Read both CSV splits into DataFrames. Note that `raw_test_data` is loaded
# from valid_data.csv.
raw_train_data = pd.read_csv(filepath_or_buffer='../data/train_data.csv')
raw_test_data = pd.read_csv(filepath_or_buffer='../data/valid_data.csv')

In [8]:
# Show the first rows of the training frame
raw_train_data.head()

Unnamed: 0,text,label
0,Here are Thursday's biggest analyst calls: App...,0
1,Buy Las Vegas Sands as travel to Singapore bui...,0
2,"Piper Sandler downgrades DocuSign to sell, cit...",0
3,"Analysts react to Tesla's latest earnings, bre...",0
4,Netflix and its peers are set for a ‘return to...,0


In [9]:
# Show the first rows of the frame loaded from valid_data.csv
raw_test_data.head()

Unnamed: 0,text,label
0,Analyst call of the day for @CNBCPro subscribe...,0
1,"Loop upgrades CSX to buy, says it's a good pla...",0
2,BofA believes we're already in a recession — a...,0
3,JPMorgan sees these derivative plays as best w...,0
4,Morgan Stanley's Huberty sees Apple earnings m...,0


In [17]:
# List the distinct label ids present in the training data
raw_train_data['label'].unique()

array([ 0,  2,  9,  8,  4,  5,  6,  1,  7, 10, 11, 12, 13, 14, 15, 17, 16,
       18, 19,  3])

In [18]:
## Label list
# Position in this list is the integer label id used in the data, i.e. the
# name of label i is labels[i]. Written as string identifiers, "LABEL_<i>"
# corresponds to labels[i] (e.g. "LABEL_0" -> "Analyst Update").
labels = [
    "Analyst Update",               # 0
    "Fed | Central Banks",          # 1
    "Company | Product News",       # 2
    "Treasuries | Corporate Debt",  # 3
    "Dividend",                     # 4
    "Earnings",                     # 5
    "Energy | Oil",                 # 6
    "Financials",                   # 7
    "Currencies",                   # 8
    "General News | Opinion",       # 9
    "Gold | Metals | Materials",    # 10
    "IPO",                          # 11
    "Legal | Regulation",           # 12
    "M&A | Investments",            # 13
    "Macro",                        # 14
    "Markets",                      # 15
    "Politics",                     # 16
    "Personnel Change",             # 17
    "Stock Commentary",             # 18
    "Stock Movement",               # 19
]

## Data Preprocessing

In [148]:
## FUNCTION FOR SAMPLING DATA AND VIEWING RESULTS

def preview_random_sample(df, sample_num = 5, random_state = None, label_names = None):
    """Print a random sample of records with their text and label name.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column and a ``label`` column whose values are
        integer positions into ``label_names``.
    sample_num : int, default 5
        Number of records to print. Capped at ``len(df)``, so asking for more
        rows than exist prints every row instead of raising.
    random_state : int or None, default None
        Forwarded to ``DataFrame.sample``; pass an integer to get the same
        records on every run.
    label_names : sequence of str or None, default None
        Names indexed by label id. ``None`` falls back to the notebook-level
        ``labels`` list.

    Returns
    -------
    str
        The literal ``'SAMPLING COMPLETE'``.
    """
    if label_names is None:
        # Looked up at call time, so the notebook-level list only has to
        # exist when the default is actually used.
        label_names = labels

    # Never request more rows than the frame holds (sample() would raise).
    sample_size = sample_num if sample_num is None else min(sample_num, len(df))
    random_sample = df.sample(sample_size, random_state=random_state)

    # Number the printed records from 1 regardless of the frame's own index.
    for position, (_, row) in enumerate(random_sample.iterrows(), start=1):
        text = row['text']
        # int() keeps the lookup working when labels were read as floats.
        label_id = int(row['label'])
        print(f'RECORD {position}')
        print(f'Text: {text}')
        print(f'Label: {label_names[label_id]}\n')

    return 'SAMPLING COMPLETE'

#### Previewing data samples

In [62]:
# Print ten randomly chosen raw training records with their label names
preview_random_sample(raw_train_data, sample_num=10)

RECORD 1
Text: $DMB - BNY Mellon Municipal Bond Infrastructure Fund goes ex dividend tomorrow  https://t.co/PBp3kdfLJj
Label: Dividend

RECORD 2
Text: Global stock markets down ahead of U.S. inflation data  https://t.co/sc0yVnPN1d
Label: Markets

RECORD 3
Text: $PARA $PARAA $PARAP - How A 15% Yield Creates Losses On Paramount Global Preferred Shares.  https://t.co/3Zud75ViTj #business #economy #investing
Label: Stock Commentary

RECORD 4
Text: $IBKR - Interactive Brokers Q2 earnings miss after daily average revenue trades slip  https://t.co/EBSOYbqZG0
Label: Financials

RECORD 5
Text: The White House expects June’s consumer price index figures to be “highly elevated” as Americans grappled with substantial increases in the cost of gas and food  https://t.co/FjYpouR6la
Label: Macro

RECORD 6
Text: The Dow is up more than 670 points this afternoon  https://t.co/LO3p0AIGKx
Label: Markets

RECORD 7
Text: U.S. Treasury diplomat nominee aims to curb China's lending influence  https://t.co/xSB

'SAMPLING COMPLETE'

#### Removing links

In [77]:
# Take the first training row as a worked example for link removal
t = raw_train_data.iloc[0]
print(t['text'])

Here are Thursday's biggest analyst calls: Apple, Amazon, Tesla, Palantir, DocuSign, Exxon &amp; more  https://t.co/QPN8Gwl7Uh


In [83]:
# Drop every run of non-whitespace characters that starts with "http"
t1 = re.compile(r'http\S+').sub('', str(t['text']))

In [84]:
# Show the example text after the link has been removed
t1

"Here are Thursday's biggest analyst calls: Apple, Amazon, Tesla, Palantir, DocuSign, Exxon &amp; more  "

In [85]:
# Check that a text containing two links has both of them removed
t2 = 'Kite Lake Capital Management (UK) LLP UK Regulatory Announcement: Form 8.3 - Capricorn Energy plc  https://t.co/RlQXQVjcrT  https://t.co/X0B2OO0eck'
t3 = re.compile(r'http\S+').sub('', t2)
t3

'Kite Lake Capital Management (UK) LLP UK Regulatory Announcement: Form 8.3 - Capricorn Energy plc    '

In [94]:
## Function for removing links from text

def remove_links(df, num_links = 5):
    """Return the records of ``df`` with every link stripped from the text.

    A link is any run of non-whitespace characters starting with ``http``.
    Whitespace around a removed link is left in place.

    Parameters
    ----------
    df : iterable
        Records to clean, e.g. a DataFrame column of tweet texts.
    num_links : int, default 5
        Accepted for backward compatibility but not used: every link in a
        record is removed, however many there are.

    Returns
    -------
    list
        One entry per input record, in the same order. Strings are returned
        without their links; non-string records (e.g. ``None`` or NaN for
        missing text) are returned unchanged instead of raising ``TypeError``.
    """
    # Compile once rather than re-parsing the pattern for every record.
    link_pattern = re.compile(r'http\S+')

    return [
        link_pattern.sub('', record) if isinstance(record, str) else record
        for record in df
    ]


In [138]:
## Removing links from the tweet text of both data sets
clean_train_text, clean_test_text = (
    remove_links(frame['text']) for frame in (raw_train_data, raw_test_data)
)

In [139]:
# Wrap each cleaned result in a one-column frame named 'Tweets' so it can be
# joined column-wise onto the raw data.
clean_train_text = pd.DataFrame(data=clean_train_text, columns=['Tweets'])
clean_test_text = pd.DataFrame(data=clean_test_text, columns=['Tweets'])

In [146]:
# Put the cleaned tweets next to the raw columns and rename so that the
# original text becomes 'old text' and the link-free text becomes 'text'.
cleaned_train_data = pd.concat(
    [raw_train_data.copy(), clean_train_text], axis=1
).set_axis(['old text', 'label', 'text'], axis=1)
cleaned_test_data = pd.concat(
    [raw_test_data.copy(), clean_test_text], axis=1
).set_axis(['old text', 'label', 'text'], axis=1)

display(cleaned_train_data.head(), cleaned_test_data.head())


Unnamed: 0,old text,label,text
0,Here are Thursday's biggest analyst calls: App...,0,Here are Thursday's biggest analyst calls: App...
1,Buy Las Vegas Sands as travel to Singapore bui...,0,Buy Las Vegas Sands as travel to Singapore bui...
2,"Piper Sandler downgrades DocuSign to sell, cit...",0,"Piper Sandler downgrades DocuSign to sell, cit..."
3,"Analysts react to Tesla's latest earnings, bre...",0,"Analysts react to Tesla's latest earnings, bre..."
4,Netflix and its peers are set for a ‘return to...,0,Netflix and its peers are set for a ‘return to...


Unnamed: 0,new text,label,text
0,Analyst call of the day for @CNBCPro subscribe...,0,Analyst call of the day for @CNBCPro subscribe...
1,"Loop upgrades CSX to buy, says it's a good pla...",0,"Loop upgrades CSX to buy, says it's a good pla..."
2,BofA believes we're already in a recession — a...,0,BofA believes we're already in a recession — a...
3,JPMorgan sees these derivative plays as best w...,0,JPMorgan sees these derivative plays as best w...
4,Morgan Stanley's Huberty sees Apple earnings m...,0,Morgan Stanley's Huberty sees Apple earnings m...


In [163]:
## Previewing cleaned tweets (training data) to spot-check that links are gone
preview_random_sample(cleaned_train_data, sample_num = 5)

RECORD 1
Text: $AFRM with a gap up triangle base break up and out today. None here.  Seems I saw the chart in @traderstewie stream recently.
Label: Stock Commentary

RECORD 2
Text: A cutoff in Russian natural gas supplies could result in a hit of as much as 2.65% to the European Union’s economy, according to an IMF working paper  
Label: Macro

RECORD 3
Text: China’s property downturn showed little signs of improvement in the three months through June  
Label: Macro

RECORD 4
Text: $TPC - Tutor Perini: Time To Execute.   #stocks #markets #trading
Label: Stock Commentary

RECORD 5
Text: Crunch time has finally arrived for Joe Biden’s economic agenda  
Label: Politics



'SAMPLING COMPLETE'

In [164]:
## Previewing cleaned tweets (data loaded from valid_data.csv) to spot-check that links are gone
preview_random_sample(cleaned_test_data, sample_num = 5)

RECORD 1
Text: Midstream/MLPs: Free Cash Flow Powerhouse.   #stockmarket #economy #stocks
Label: Stock Commentary

RECORD 2
Text: CHINESE CITY QINGDAO TO HOLD RCEP HIGH-LEVEL FORUM ON ECONOMIC, TRADE COOPERATION ON JULY 27-29 - STATE MEDIA
Label: Macro

RECORD 3
Text: Uber is being sued by hundreds of women who claim they were sexually assaulted by drivers  
Label: Legal | Regulation

RECORD 4
Text: $CRTO - Berenberg reduced Criteo estimates to reflect significant currency headwinds  
Label: Analyst Update

RECORD 5
Text: Three people have been charged in New York with conspiring to illegally possess about 100 pages of manuscripts related to the Eagles album “Hotel California”  
Label: Legal | Regulation



'SAMPLING COMPLETE'