In [32]:
## Libraries and packages

# data processessing packages
import numpy as np
import pandas as pd
import re

# ml packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

## Loading the data

In [2]:
raw_train_data = pd.read_csv('../data/train_data.csv')
raw_test_data = pd.read_csv('../data/valid_data.csv')

In [3]:
raw_train_data.head()

Unnamed: 0,text,label
0,Here are Thursday's biggest analyst calls: App...,0
1,Buy Las Vegas Sands as travel to Singapore bui...,0
2,"Piper Sandler downgrades DocuSign to sell, cit...",0
3,"Analysts react to Tesla's latest earnings, bre...",0
4,Netflix and its peers are set for a ‘return to...,0


In [4]:
raw_test_data.head()

Unnamed: 0,text,label
0,Analyst call of the day for @CNBCPro subscribe...,0
1,"Loop upgrades CSX to buy, says it's a good pla...",0
2,BofA believes we're already in a recession — a...,0
3,JPMorgan sees these derivative plays as best w...,0
4,Morgan Stanley's Huberty sees Apple earnings m...,0


In [5]:
# Combining data for pre-processing and encoding
raw_data = pd.concat([raw_train_data, raw_test_data], axis=0, ignore_index=True)

print(raw_data.shape)
raw_data

(21107, 2)


Unnamed: 0,text,label
0,Here are Thursday's biggest analyst calls: App...,0
1,Buy Las Vegas Sands as travel to Singapore bui...,0
2,"Piper Sandler downgrades DocuSign to sell, cit...",0
3,"Analysts react to Tesla's latest earnings, bre...",0
4,Netflix and its peers are set for a ‘return to...,0
...,...,...
21102,Dollar bonds of Chinese developers fall as str...,3
21103,Longer maturity Treasury yields have scope to ...,3
21104,Pimco buys €1bn of Apollo buyout loans from ba...,3
21105,Analysis: Banks' snubbing of junk-rated loan f...,3


In [6]:
raw_data.label.unique()

array([ 0,  2,  9,  8,  4,  5,  6,  1,  7, 10, 11, 12, 13, 14, 15, 17, 16,
       18, 19,  3])

In [7]:
## Label list
labels = ["Analyst Update","Fed | Central Banks",
        "Company | Product News","Treasuries | Corporate Debt",
        "Dividend","Earnings","Energy | Oil",
        "Financials","Currencies","General News | Opinion",
        "Gold | Metals | Materials","IPO","Legal | Regulation",
        "M&A | Investments","Macro","Markets","Politics",
        "Personnel Change","Stock Commentary", "Stock Movement"]

# Label dictionary 
# labels = {"LABEL_0": "Analyst Update",
#           "LABEL_1": "Fed | Central Banks",
#           "LABEL_2": "Company | Product News",
#           "LABEL_3": "Treasuries | Corporate Debt",
#           "LABEL_4": "Dividend",
#           "LABEL_5": "Earnings",
#           "LABEL_6": "Energy | Oil",
#           "LABEL_7": "Financials",
#           "LABEL_8": "Currencies",
#           "LABEL_9": "General News | Opinion",
#           "LABEL_10": "Gold | Metals | Materials",
#           "LABEL_11": "IPO",
#           "LABEL_12": "Legal | Regulation",
#           "LABEL_13": "M&A | Investments",
#           "LABEL_14": "Macro",
#           "LABEL_15": "Markets",
#           "LABEL_16": "Politics",
#           "LABEL_17": "Personnel Change",
#           "LABEL_18": "Stock Commentary",
#           "LABEL_19": "Stock Movement"
# }

## Data Preprocessing

In [8]:
## FUNCTION FOR SAMPLING DATA AND VIEWING RESULTS

def preview_random_sample(df, sample_num = 5):
    # A function to grab a random sample and display the text and associated labels

    # VARIABLES
    # df = dataframe
    # col_name = name of column to display
    # sample_num = number of samples to display

    # generating random sample
    random_sample = df.sample(sample_num)

    # looping over sampple an displaying results
    for i, data in enumerate(random_sample.iterrows()):
        print(f'RECORD {i+1}')
        print(f'Text: {data[1].text}')
        print(f'Label: {labels[data[1].label]}\n')

    return 'SAMPLING COMPLETE'

#### Previewing data samples

In [9]:
preview_random_sample(raw_data, sample_num=10)

RECORD 1
Text: Mirakl Partners with eDesk to Empower Sellers in Their Marketplace Expansion  https://t.co/dob2y4Lou3  https://t.co/Nh3GYBvE7O
Label: Company | Product News

RECORD 2
Text: $HOPE - Hope Bancorp, Inc. 2022 Q2 - Results - Earnings Call Presentation.  https://t.co/lSol5cMuVH #finance #economy #markets
Label: Earnings

RECORD 3
Text: Westlake Announces Second Quarter 2022 Earnings Conference Call  https://t.co/MdpVeSHvFE  https://t.co/pKvRfJGi3L
Label: Earnings

RECORD 4
Text: The Saudi crown prince’s urban megaproject is supposed to have a ski resort, swim lanes for commuters, and “smart” everything. It’s going great—for the consultants  https://t.co/QILN5VT44s
Label: General News | Opinion

RECORD 5
Text: $BA stock rises on its $DAL jet deal, bank stocks $GS and $BAC rise after posting Q2 earnings, and $GE announces the new names for its spin-off companies.  https://t.co/5QRybgHIim
Label: Stock Movement

RECORD 6
Text: Twitter v. Musk: Can the billionaire be forced to buy 

'SAMPLING COMPLETE'

#### Removing links

In [10]:
## Function for removing links from text

def remove_links(df, num_links = 5):
    # VARIABLES:
    # df = dataframe column to iterate over
    # num_links = count of regex matches to remove

    cleaned_text = []

    for record in df:
        cleaned_record = re.sub(r'http\S+','',record)
        cleaned_text.append(cleaned_record)
    
    return cleaned_text


In [11]:
# Removing links from train data
clean_train_text = remove_links(raw_data['text'])

# turning list into df
clean_train_text = pd.DataFrame(clean_train_text, columns = ['Tweets'])

# concatenating cleaned tweets with original df
cleaned_train_data = pd.concat([raw_data, clean_train_text], axis=1)

# relabeling column names
cleaned_train_data.columns = ['original text', 'label', 'text']

display(cleaned_train_data.head())

Unnamed: 0,original text,label,text
0,Here are Thursday's biggest analyst calls: App...,0,Here are Thursday's biggest analyst calls: App...
1,Buy Las Vegas Sands as travel to Singapore bui...,0,Buy Las Vegas Sands as travel to Singapore bui...
2,"Piper Sandler downgrades DocuSign to sell, cit...",0,"Piper Sandler downgrades DocuSign to sell, cit..."
3,"Analysts react to Tesla's latest earnings, bre...",0,"Analysts react to Tesla's latest earnings, bre..."
4,Netflix and its peers are set for a ‘return to...,0,Netflix and its peers are set for a ‘return to...


In [12]:
## Previewing cleaned tweets
preview_random_sample(cleaned_train_data, sample_num = 5)

RECORD 1
Text: Australia detects traces of foot-and-mouth disease on imported animal products, deepening fears about a potential outbreak that could devastate the nation’s livestock industry  
Label: General News | Opinion

RECORD 2
Text: $XLK: Sector Briefing: Technology  
Label: Stock Commentary

RECORD 3
Text: Caution begins to emerge as a theme during this earnings season  
Label: Earnings

RECORD 4
Text: $ET - Energy Transfer: Amazing Deal, Pay Nothing For Their Upcoming Distribution Growth.   #markets #stocks #investing
Label: Stock Commentary

RECORD 5
Text: @FatIrish66 Fed certainly not defending stocks lol - they are trying to take it lower and unable too
Label: Fed | Central Banks



'SAMPLING COMPLETE'

#### Generate training/testing and labels

In [13]:
## Function for selecting/encoding features and labels

def model_data_labels(df, features = 'text', labels = 'label'):
    # VARIABLES:
    # df = dataframe
    # features = name of column for features in df
    # labels = name of column for labels in df

    # selecting features and labels
    x = df[features]
    y = df[labels]

    # encoding features
    count_vect = CountVectorizer()
    x_word_count = count_vect.fit_transform(x)
    tf_transformer = TfidfTransformer(use_idf=True).fit(x_word_count)
    x_enc_tf = tf_transformer.transform(x_word_count)

    # creating train/test split
    x_train, x_test, y_train, y_test = train_test_split(x_enc_tf, y, test_size=.15, random_state=42)

    print(f'x_train shape:\t\t\t\t{x_train.shape}\n' +
          f'y_train shape:\t\t\t\t{y_train.shape}\n' + 
          f'x_test shape:\t\t\t\t{x_test.shape}\n' + 
          f'y_test shape:\t\t\t\t{y_test.shape}\n' )

    return  x_train, x_test, y_train, y_test



In [14]:
x_train, x_test, y_train, y_test = model_data_labels(cleaned_train_data)

x_train shape:				(17940, 25414)
y_train shape:				(17940,)
x_test shape:				(3167, 25414)
y_test shape:				(3167,)



## Baselines

In [None]:
# Instantiate an RBF SVM
svm = SVC()

# Instantiate the GridSearchCV object and run the search
parameters = {'gamma':[0.00001, 0.0001, 0.001, 0.01, 0.1]}
searcher = GridSearchCV(svm, parameters)
searcher.fit(X,y)

In [39]:
clf = SVC()
clf.fit(x_train, y_train)

In [38]:
clf.score(x_test, y_test)

0.20397852857593937

In [17]:
sgd_clf = SGDClassifier()
sgd_clf.fit(x_train, y_train)

In [18]:
str(sgd_clf)

'SGDClassifier()'

In [19]:
sgd_clf.score(x_test, y_test)

0.8279128512788128

In [20]:
# function for running multiple classification models
def run_cls_models(models=[], x_train=x_train,  x_test=x_test, y_train=y_train, y_test=y_test):
    # VARIABLES
    # models = models to loop over
    # x/y train/test = train and testing data

    for model in models:
        model_init = model()
        model_init.fit(x_train, y_train)
        test_accuracy = model_init.score(x_test, y_test)
        print(f"{str(model_init)}\nTest Accuracy: {test_accuracy}\n")


In [21]:
cls_models = [SGDClassifier, SVC, LinearSVC]

In [22]:
run_cls_models(cls_models, x_train, x_test, y_train, y_test)

SGDClassifier()
Test Accuracy: 0.8269655825702558

SVC()
Test Accuracy: 0.8083359646353016

LinearSVC()
Test Accuracy: 0.8478054941585096



In [33]:
lr = LogisticRegression(multi_class='ovr')
lr.fit(x_train, y_train)

In [34]:
lr.score(x_test, y_test)

0.7682349226397222

In [26]:
OvR_SVC = OneVsRestClassifier(SVC()).fit(x_train, y_train)

In [27]:
OvR_SVC.score(x_test, y_test)

0.839911588253868

In [28]:
OvR_LinearSVC = OneVsRestClassifier(LinearSVC()).fit(x_train, y_train)

In [29]:
OvR_LinearSVC.score(x_test, y_test)

0.8478054941585096

In [30]:
OvR_SGD = OneVsRestClassifier(SGDClassifier()).fit(x_train, y_train)

In [31]:
OvR_SGD.score(x_test, y_test)

0.8257025576255131