In [2]:
import re
import os
import time
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import random
import numpy as np
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from nltk.tokenize import word_tokenize,sent_tokenize,RegexpTokenizer
from nltk.classify import NaiveBayesClassifier
from nltk import pos_tag
import pandas as pd
import nltk
from nltk.tag import StanfordNERTagger
from sklearn.model_selection import train_test_split
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
import ast
import itertools
import unidecode
import ftfy
import glob

# Data Preprocessing
### Load Data

In [3]:
#read CEO name from CSV, put into list

data_ceo = pd.read_csv('./ceo.csv',delimiter=',',header=None , encoding='gbk')

ceo_list = []
for i in range(data_ceo.shape[0]):
    if data_ceo.iloc[i:i+1,1].any()!=False:
        ceo_list.append(str(data_ceo.iloc[i:i+1,0].values.tolist()+data_ceo.iloc[i:i+1,1].values.tolist()).replace(',','').replace('\'','').replace('[','').replace(']',''))
    else :
        ceo_list.append(str(data_ceo.iloc[i:i+1,0].values.tolist()).replace(',','').replace('\'','').replace('[','').replace(']',''))

In [4]:
#read company name from CSV, put into list

data_company = pd.read_csv('./companies.csv',delimiter='/t',header=None , encoding='gbk',engine='python')

company_list = []
for i in range(data_company.shape[0]):
    company_list.append(str(data_company.iloc[i:i+1,0].values.tolist()).replace(',','').replace('\'','').replace('[','').replace(']',''))

In [5]:
#read percentage data from CSV, put into list

data_percent = pd.read_csv('./percentage.csv',delimiter=',',header=None, encoding = "latin-1")

percent_list = []
for i in range(data_percent.shape[0]):
    percent_list.append(str(data_percent.iloc[i:i+1,0].values.tolist()).replace(',','').replace('\'','').replace('[','').replace(']',''))

In [6]:
ceo_list[:3]
company_list[:3]
percent_list[:3]
print("Top three CEOs:" + str(ceo_list[:3]))
print("Top three companys:" + str(company_list[:3]))
print("Top three percentages:" + str(percent_list[:3]))

Top three CEOs:['Tom Horton', 'Patti Hart', 'Jamie Dimon']
Top three companys:['Abaxis Inc', 'ACA Financial', 'Alibaba Group Holding Ltd']
Top three percentages:['66%', '40%', '90%']


### Store all txt content into one TXT

In [6]:
#Get all file names

all_files2013 = []
for filename in os.listdir('./2013'):
    all_files2013.append('./2013/' + filename)

all_files2014 = []
for filename in os.listdir('./2014'):
    all_files2014.append('./2014/' + filename)

txtname_list = all_files2013+all_files2014

In [7]:
#Store all content in one file

dicts = []
for i , file in enumerate(txtname_list):
    with open(file,encoding='utf-8',errors='ignore') as f:
        articles = []
        for article in f :
            article = sent_tokenize(article)
            articles.append(article)
    dicts.append(articles)
with open('all_news.txt','w',encoding='utf-8')as aa:
    aa.write(str(dicts))

### Store all txt content into one CSV

In [28]:
#Load all_articles.txt
with open('all_news.txt','r', encoding='utf-8' , errors='ignore') as an:
    corpus = ast.literal_eval(an.read())

In [29]:
corpus = list(itertools.chain(*corpus))

In [30]:
corpus[:1]

[['Earlier today we had a strong South Korean PMI report.',
  'The latest?',
  'Taiwan.',
  'It just saw a rise in December PMI from 47.4 to 50.6.',
  'From the report:']]

In [31]:
article_sentence_list = [x for sublist in corpus for x in sublist]

In [32]:
#Transfer to dataframe
article_sentence_dataframe = pd.DataFrame({'Article':corpus})
article_sentence_dataframe.to_csv("article_sentence_dataframe.csv", index = False)

In [33]:
#Store all sentences to one CSV, one sentence for one line
articles = pd.DataFrame(article_sentence_list)
articles.to_csv("article_sentence_list.csv", index = False)

In [34]:
print("All sentences in list:")
article_sentence_list[1]

All sentences in list:


'The latest?'

In [35]:
print("All sentences in dataframe:")
article_sentence_dataframe

All sentences in dataframe:


Unnamed: 0,Article
0,[Earlier today we had a strong South Korean PM...
1,[With the House prepared to vote on the Senate...
2,"[Good news for the global economy., South Kore..."
3,[UPDATE: As the Fiscal Cliff bill gets closer ...
4,[Taxes increased for almost all Americans at m...
...,...
35893,[Thomson ReutersFile photo of Mexico's state-r...
35894,[]
35895,[(Reuters) - Revelers ringing in of the new ye...
35896,[Washington (AFP) - The International Monetary...


# Extract Percentages

In [36]:
sentence = [x[0] for x in articles.values if x is not None]

In [37]:
#Sort out percentage format

percentage_pattern_1 = f"\d+(?:\.\d+)?(?:%| percent?)"
percentage_pattern_2 = f"\d+(?:\.\d+)?(?:%| percentage points?)"
percentage_pattern_3 = f"-\d+(?:\.\d+)?(?:%| percentage points?)"
percentage_pattern_4 = f"-\d+(?:\.\d+)?(?:%| percent?)"

In [38]:
#Find percentages in all sentences and store in "match_percent"

match_percent = []
for sentences in sentence:
   
    percentage_1 = re.findall(percentage_pattern_1, sentences)
    percentage_2 = re.findall(percentage_pattern_2, sentences)
    percentage_3 = re.findall(percentage_pattern_3, sentences)
    percentage_4 = re.findall(percentage_pattern_4, sentences)
    percents = list(set(percentage_1 + percentage_2 +percentage_3 + percentage_4 ))
    
    if percents:
        for percentage in percents:
            match_percent.append(percentage)

In [19]:
print("Top three percentages:" + str(match_percent[:3]))

Top three percentages:['1.2%', '47%', '39.6%']


In [38]:
pd.Series(list(set(match_percent))).to_csv("final_percent.csv", index = False, header = False)

# Extract Company & CEO_Part 1

In [7]:
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import logging
import gensim

### Word2Vec Continuous bag of words

In [8]:
#Vectorize words in the article

word_to_vec = open('./all_news.txt',encoding="utf-8",errors = "ignore")
model = Word2Vec(LineSentence(word_to_vec) , size = 100 , window=10 , min_count=3,workers= 15,sample=1e-3 )

In [22]:
#Test "Steve Jobs" as vector

print ("Steve Jobs:")
print (model['Steve'] +model['Jobs'])

Steve Jobs:
[-2.9597616  -1.5068251   0.1340813  -2.3553498   0.3226763   0.27790716
 -1.3274721   2.4182012  -3.1326528  -1.1297013   0.8322681   3.484494
 -1.6384027  -1.9782174   1.5027729   2.4033165   1.8217999   3.4699244
 -3.0009415   1.7511106  -1.1517495  -5.3150554  -5.1746283   1.553317
  1.0321525  -2.8407555   2.3095436  -0.564881    4.644501   -0.35152698
  2.2184474  -1.6829476  -6.235709   -2.966454   -1.977063   -2.3877914
  1.4012516  -2.4139614  -2.8470974  -1.0403636  -6.4503407  -1.8489287
  3.724516    0.5838772   5.051236    2.0116074  -5.905501    0.85073215
 -3.3060398   2.716598   -0.51692057 -0.44450846 -0.8391253   3.5039253
  0.85933787  2.7541132   1.6094096   2.8886068  -0.42764357 -2.6095579
 -0.63557595  3.834567   -0.06438978  0.7030156  -1.6113689   1.0189065
 -1.7417018   2.9895082  -0.30642456 -4.483951   -0.09964609  0.8489973
 -0.7045318   1.9155669   1.8483373   0.21476877  2.1247215  -1.0979493
  0.32794967 -1.8213513  -2.5131197   0.02347901 -5

  after removing the cwd from sys.path.


### Numerical feature embedding

In [9]:
company_vec = []
for index in range(len(company_list)):
    str_com = str(company_list[index])
    word = word_tokenize(str_com) #word_tokenize applt to string, get outcome in list
    for i in range(len(word)):
        try:
            if len(word) == 1:
                company_vec.append(model[str(word[0])])
            elif len(word) == 2:
                company_vec.append(model[str(word[0])]+model[str(word[1])])
            elif len(word) == 3:
                company_vec.append(model[str(word[0])]+model[str(word[1])]+model[str(word[2])])
            elif len(word) == 4:
                company_vec.append(model[str(word[0])]+model[str(word[1])]+model[str(word[2])]+model[str(word[3])])
            elif len(word) == 5:
                company_vec.append(model[str(word[0])]+model[str(word[1])]+model[str(word[2])]+model[str(word[3])]+model[str(word[4])])
            elif len(word) == 6:
                company_vec.append(model[str(word[0])]+model[str(word[1])]+model[str(word[2])]+model[str(word[3])]+model[str(word[4])]+model[str(word[5])])
        except KeyError:
            pass
        continue

  # Remove the CWD from sys.path while we load stuff.
  
  if sys.path[0] == '':
  
  app.launch_new_instance()


In [10]:
ceo_vec = []
for index in range(len(ceo_list)):
    str_com = str(ceo_list[index])
    word = word_tokenize(str_com)
    for i in range(len(word)):
        try:
            if len(word) == 1:
                ceo_vec.append(model[str(word[0])])
            elif len(word) == 2:
                ceo_vec.append(model[str(word[0])]+model[str(word[1])])
            elif len(word) == 3:
                ceo_vec.append(model[str(word[0])]+model[str(word[1])]+model[str(word[2])])
            elif len(word) == 4:
                ceo_vec.append(model[str(word[0])]+model[str(word[1])]+model[str(word[2])]+model[str(word[3])])
            elif len(word) == 5:
                ceo_vec.append(model[str(word[0])]+model[str(word[1])]+model[str(word[2])]+model[str(word[3])]+model[str(word[4])])
            elif len(word) == 6:
                ceo_vec.append(model[str(word[0])]+model[str(word[1])]+model[str(word[2])]+model[str(word[3])]+model[str(word[4])]+model[str(word[5])])
            else :
                print ('You missed me，I have'+str(len(word))+'characters')
        except KeyError: #ignore word in csv but not in txt
            pass
        continue

  # Remove the CWD from sys.path while we load stuff.
  
  if sys.path[0] == '':
  
  app.launch_new_instance()


In [11]:
percent_vec = []
for index in range(len(percent_list)):
    str_com = str(percent_list[index])
    word = word_tokenize(str_com)
    for i in range(len(word)):
        try:
            if len(word) == 1:
                percent_vec.append(model[str(word[0])])
            elif len(word) == 2:
                percent_vec.append(model[str(word[0])]+model[str(word[1])])
            elif len(word) == 3:
                percent_vec.append(model[str(word[0])]+model[str(word[1])]+model[str(word[2])])
            elif len(word) == 4:
                percent_vec.append(model[str(word[0])]+model[str(word[1])]+model[str(word[2])]+model[str(word[3])])
            elif len(word) == 5:
                percent_vec.append(model[str(word[0])]+model[str(word[1])]+model[str(word[2])]+model[str(word[3])]+model[str(word[4])])
            elif len(word) == 6:
                percent_vec.append(model[str(word[0])]+model[str(word[1])]+model[str(word[2])]+model[str(word[3])]+model[str(word[4])]+model[str(word[5])])
            else :
                print ('You missed me，I have'+str(len(word))+'characters')
        except KeyError: #ignore word in csv but not in txt
            pass
        continue

  # Remove the CWD from sys.path while we load stuff.
  if sys.path[0] == '':
  
  
  app.launch_new_instance()


You missed me，I have7characters
You missed me，I have7characters
You missed me，I have7characters
You missed me，I have7characters
You missed me，I have7characters
You missed me，I have7characters
You missed me，I have7characters
You missed me，I have8characters
You missed me，I have8characters
You missed me，I have8characters
You missed me，I have8characters
You missed me，I have8characters
You missed me，I have8characters
You missed me，I have8characters
You missed me，I have8characters
You missed me，I have9characters
You missed me，I have9characters
You missed me，I have9characters
You missed me，I have9characters
You missed me，I have9characters
You missed me，I have9characters
You missed me，I have9characters
You missed me，I have9characters
You missed me，I have9characters
You missed me，I have9characters
You missed me，I have9characters
You missed me，I have9characters
You missed me，I have9characters
You missed me，I have9characters
You missed me，I have9characters
You missed me，I have9characters
You miss

In [25]:
print("Company Name Vector:" )
company_vec[3]

Company Name Vector:


array([ 4.3541946e+00,  3.4193377e+00, -3.5411949e+00, -3.9527938e+00,
        2.5740757e+00,  2.4453731e+00, -3.9481349e+00, -3.7773591e-01,
        3.2008009e+00, -1.2572389e+00, -5.4137173e+00, -4.2864728e+00,
       -1.6015115e+00, -6.4008164e+00,  5.8948398e-01,  3.3943436e+00,
        3.7794540e+00,  1.3823633e+00, -1.0662446e+01,  6.5635319e+00,
       -3.5857491e+00,  1.8158191e+00, -7.0564833e+00,  6.6540208e+00,
       -3.5353794e+00, -7.2521267e+00, -1.8367238e+00,  3.4347782e+00,
        2.2410793e+00, -2.1735854e+00,  3.7985950e+00,  9.9532471e+00,
       -1.1499087e+01, -1.1635940e+00,  2.3879995e+00, -3.8141751e+00,
        3.4086905e+00, -3.3996701e+00, -9.9246197e+00, -5.1632462e+00,
        2.2573557e+00, -7.3076849e+00,  8.9233065e+00, -4.2628422e+00,
        7.9283743e+00,  1.0616806e+00, -6.0110292e+00, -1.0453841e-01,
        6.3064222e+00,  9.7293491e+00,  7.5068822e+00,  1.8294439e+00,
        3.8762760e+00,  1.7198867e+00, -8.9983683e+00,  3.2940216e+00,
      

In [26]:
print("CEO Name Vector:" )
ceo_vec[3]

CEO Name Vector:


array([ 0.10866289,  0.07184193,  0.03690156,  0.15819976,  0.19415161,
        0.12825249,  0.16493034, -0.00368723,  0.01251288, -0.19185314,
        0.05193966,  0.09454677,  0.12402786, -0.01694496, -0.04928596,
        0.369016  ,  0.19174726,  0.28353015, -0.24365726, -0.20929348,
       -0.13468644, -0.31385455, -0.1169305 ,  0.30654198,  0.20671454,
       -0.03087126, -0.15253854,  0.08482849,  0.06693047, -0.09682461,
       -0.0784307 , -0.27427223, -0.49015403, -0.21063617, -0.00978877,
       -0.29138982,  0.10826781, -0.24006122, -0.07915772, -0.21499208,
       -0.49883217, -0.16317767,  0.05221057, -0.08790493,  0.39464283,
        0.21299632, -0.00931131,  0.2901436 , -0.2287218 ,  0.2726487 ,
       -0.01445492,  0.23405373, -0.07543539,  0.20386091, -0.03464732,
        0.38726863,  0.2227025 , -0.02045513, -0.07711802, -0.01962579,
        0.01451896,  0.1242757 , -0.00133781,  0.02844796, -0.4148632 ,
       -0.0905897 ,  0.06315812, -0.31952247, -0.07057078, -0.24

In [28]:
print ("Percent Vector:")
percent_vec[3]

Percent Vector:


array([-0.48897105,  0.612985  ,  1.1968249 ,  1.5998785 , -0.5037883 ,
       -1.8469266 ,  1.6279854 , -1.6520817 , -2.8811524 , -1.0287515 ,
       -0.01121452,  0.60431033,  3.942523  , -1.6251485 , -1.1037431 ,
       -2.603001  ,  0.99345964, -2.1097896 , -1.6698667 , -2.7025144 ,
       -1.1595101 ,  2.2526019 , -1.8374151 , -1.330566  ,  0.49782833,
        1.2867467 ,  0.7151548 ,  1.4857168 ,  1.1034298 , -3.1248085 ,
       -2.0425394 ,  3.9261618 , -3.0643291 ,  0.86246556,  2.6916492 ,
       -1.7372695 ,  0.10853331, -1.9266554 ,  1.6921926 , -0.76491106,
       -1.8900174 ,  0.12058324, -1.3849692 ,  0.21291739, -1.3260963 ,
        0.6917497 , -3.3894885 , -1.4158838 ,  3.8485916 , -0.36665553,
       -0.84538394,  2.1442897 , -0.8289734 ,  0.43705165,  0.7462203 ,
        1.0352986 , -1.6259255 , -2.391779  ,  0.19479391, -2.493569  ,
        0.41686988,  1.5014886 , -0.04474411, -0.2648189 , -1.4225856 ,
       -0.03805172, -4.0777583 , -2.8481195 , -3.6814265 ,  3.44

In [31]:
#Find likely Percent
percent_like = []
for i in range(articles.shape[0]):
    if i % 7000 == 0:
        print (i/719566)
    match_find_percentage_1 = re.findall(percentage_pattern_1, str(articles.iloc[i:i+1,0]))
    match_find_percentage_2 = re.findall(percentage_pattern_2, str(articles.iloc[i:i+1,0]))
    match_find_percentage_3 = re.findall(percentage_pattern_3, str(articles.iloc[i:i+1,0]))
    match_find_percentage_4 = re.findall(percentage_pattern_4, str(articles.iloc[i:i+1,0]))
    if match_find_percentage_1:
        percent_like.append(match_find_percentage_1)
    if match_find_percentage_2:
        percent_like.append(match_find_percentage_2)
    if match_find_percentage_3:
        percent_like.append(match_find_percentage_3)
    if match_find_percentage_4:
        percent_like.append(match_find_percentage_4)
    

0.0
9.728086096341406e-05
0.00019456172192682813
0.00029184258289024215
0.00038912344385365626
0.0004864043048170703
0.0005836851657804843
0.0006809660267438984
0.0007782468877073125
0.0008755277486707265
0.0009728086096341406
0.0010700894705975546
0.0011673703315609686
0.0012646511925243828
0.0013619320534877968
0.0014592129144512108
0.001556493775414625
0.001653774636378039
0.001751055497341453
0.001848336358304867
0.0019456172192682812
0.002042898080231695
0.0021401789411951092
0.0022374598021585234
0.002334740663121937
0.0024320215240853514
0.0025293023850487656
0.0026265832460121794
0.0027238641069755936
0.002821144967939008
0.0029184258289024216
0.003015706689865836
0.00311298755082925
0.003210268411792664
0.003307549272756078
0.003404830133719492
0.003502110994682906
0.0035993918556463203
0.003696672716609734
0.0037939535775731482
0.0038912344385365625
0.003988515299499977
0.00408579616046339
0.004183077021426804
0.0042803578823902184
0.004377638743353633
0.004474919604317047
0.

0.0699449390326947
0.07004221989365812
0.07013950075462154
0.07023678161558496
0.07033406247654836
0.07043134333751178
0.07052862419847519
0.07062590505943861
0.07072318592040203
0.07082046678136543
0.07091774764232885
0.07101502850329226
0.07111230936425568
0.0712095902252191
0.0713068710861825
0.07140415194714592
0.07150143280810933
0.07159871366907275
0.07169599453003617
0.07179327539099957
0.07189055625196299
0.0719878371129264
0.07208511797388982
0.07218239883485324
0.07227967969581664
0.07237696055678006
0.07247424141774347
0.07257152227870689
0.07266880313967031
0.07276608400063371
0.07286336486159713
0.07296064572256054
0.07305792658352396
0.07315520744448738
0.07325248830545078
0.0733497691664142
0.07344705002737761
0.07354433088834103
0.07364161174930445
0.07373889261026785
0.07383617347123127
0.07393345433219468
0.0740307351931581
0.07412801605412152
0.07422529691508492
0.07432257777604834
0.07441985863701175
0.07451713949797517
0.07461442035893859
0.07471170121990199
0.0748

0.11809896520958467
0.11819624607054809
0.11829352693151149
0.1183908077924749
0.11848808865343832
0.11858536951440174
0.11868265037536516
0.11877993123632856
0.11887721209729198
0.11897449295825539
0.11907177381921881
0.11916905468018223
0.11926633554114563
0.11936361640210905
0.11946089726307246
0.11955817812403588
0.1196554589849993
0.1197527398459627
0.11985002070692612
0.11994730156788953
0.12004458242885295
0.12014186328981637
0.12023914415077977
0.12033642501174319
0.1204337058727066
0.12053098673367002
0.12062826759463344
0.12072554845559684
0.12082282931656026
0.12092011017752367
0.12101739103848709
0.12111467189945051
0.12121195276041391
0.12130923362137733
0.12140651448234074
0.12150379534330416
0.12160107620426758
0.12169835706523098
0.1217956379261944
0.12189291878715781
0.12199019964812123
0.12208748050908465
0.12218476137004805
0.12228204223101147
0.12237932309197488
0.1224766039529383
0.12257388481390172
0.12267116567486512
0.12276844653582854
0.12286572739679195
0.1229

0.16839317032766973
0.16849045118863315
0.16858773204959657
0.16868501291055998
0.1687822937715234
0.16887957463248682
0.1689768554934502
0.16907413635441362
0.16917141721537704
0.16926869807634046
0.16936597893730387
0.1694632597982673
0.1695605406592307
0.16965782152019412
0.16975510238115754
0.16985238324212096
0.16994966410308435
0.17004694496404776
0.17014422582501118
0.1702415066859746
0.17033878754693801
0.17043606840790143
0.17053334926886485
0.17063063012982826
0.17072791099079168
0.1708251918517551
0.1709224727127185
0.1710197535736819
0.17111703443464532
0.17121431529560874
0.17131159615657215
0.17140887701753557
0.171506157878499
0.1716034387394624
0.17170071960042582
0.17179800046138924
0.17189528132235263
0.17199256218331604
0.17208984304427946
0.17218712390524288
0.1722844047662063
0.1723816856271697
0.17247896648813313
0.17257624734909655
0.17267352821005996
0.17277080907102338
0.17286808993198677
0.17296537079295018
0.1730626516539136
0.17315993251487702
0.173257213375

0.2124614003440963
0.21255868120505972
0.21265596206602314
0.21275324292698655
0.21285052378794997
0.21294780464891339
0.21304508550987677
0.2131423663708402
0.2132396472318036
0.21333692809276703
0.21343420895373044
0.21353148981469386
0.21362877067565728
0.2137260515366207
0.2138233323975841
0.21392061325854753
0.21401789411951092
0.21411517498047433
0.21421245584143775
0.21430973670240117
0.21440701756336458
0.214504298424328
0.21460157928529142
0.21469886014625483
0.21479614100721825
0.21489342186818167
0.21499070272914506
0.21508798359010847
0.2151852644510719
0.2152825453120353
0.21537982617299872
0.21547710703396214
0.21557438789492556
0.21567166875588897
0.2157689496168524
0.2158662304778158
0.2159635113387792
0.2160607921997426
0.21615807306070603
0.21625535392166945
0.21635263478263286
0.21644991564359628
0.2165471965045597
0.2166444773655231
0.21674175822648653
0.21683903908744995
0.21693631994841334
0.21703360080937675
0.21713088167034017
0.2172281625313036
0.21732544339226

0.26927342314673014
0.26937070400769353
0.2694679848686569
0.26956526572962036
0.26966254659058375
0.2697598274515472
0.2698571083125106
0.26995438917347403
0.2700516700344374
0.27014895089540086
0.27024623175636425
0.27034351261732764
0.2704407934782911
0.2705380743392545
0.2706353552002179
0.2707326360611813
0.27082991692214475
0.27092719778310814
0.2710244786440716
0.271121759505035
0.2712190403659984
0.2713163212269618
0.2714136020879252
0.27151088294888864
0.27160816380985203
0.2717054446708155
0.27180272553177887
0.2719000063927423
0.2719972872537057
0.27209456811466914
0.27219184897563253
0.2722891298365959
0.27238641069755937
0.27248369155852276
0.2725809724194862
0.2726782532804496
0.27277553414141303
0.2728728150023764
0.27297009586333987
0.27306737672430326
0.2731646575852667
0.2732619384462301
0.2733592193071935
0.2734565001681569
0.2735537810291203
0.27365106189008376
0.27374834275104715
0.2738456236120106
0.273942904472974
0.2740401853339374
0.2741374661949008
0.274234747

0.3586745343721076
0.35877181523307106
0.35886909609403445
0.3589663769549979
0.3590636578159613
0.3591609386769247
0.3592582195378881
0.35935550039885156
0.35945278125981495
0.35955006212077834
0.3596473429817418
0.35974462384270517
0.3598419047036686
0.359939185564632
0.36003646642559545
0.36013374728655884
0.3602310281475223
0.36032830900848567
0.36042558986944906
0.3605228707304125
0.3606201515913759
0.36071743245233934
0.3608147133133027
0.36091199417426617
0.36100927503522956
0.361106555896193
0.3612038367571564
0.36130111761811984
0.3613983984790832
0.3614956793400466
0.36159296020101006
0.36169024106197345
0.3617875219229369
0.3618848027839003
0.36198208364486373
0.3620793645058271
0.36217664536679056
0.36227392622775395
0.36237120708871734
0.3624684879496808
0.3625657688106442
0.3626630496716076
0.362760330532571
0.36285761139353445
0.36295489225449784
0.3630521731154613
0.3631494539764247
0.3632467348373881
0.3633440156983515
0.3634412965593149
0.36353857742027834
0.363635858

0.4070231222709244
0.40712040313188785
0.40721768399285124
0.4073149648538147
0.4074122457147781
0.40750952657574147
0.4076068074367049
0.4077040882976683
0.40780136915863174
0.40789865001959513
0.4079959308805586
0.40809321174152197
0.4081904926024854
0.4082877734634488
0.4083850543244122
0.40848233518537563
0.408579616046339
0.40867689690730247
0.40877417776826586
0.4088714586292293
0.4089687394901927
0.40906602035115613
0.4091633012121195
0.40926058207308297
0.40935786293404636
0.40945514379500975
0.4095524246559732
0.4096497055169366
0.4097469863779
0.4098442672388634
0.40994154809982686
0.41003882896079025
0.4101361098217537
0.4102333906827171
0.41033067154368047
0.4104279524046439
0.4105252332656073
0.41062251412657075
0.41071979498753414
0.4108170758484976
0.41091435670946097
0.4110116375704244
0.4111089184313878
0.41120619929235125
0.41130348015331464
0.411400761014278
0.41149804187524147
0.41159532273620486
0.4116926035971683
0.4117898844581317
0.41188716531909514
0.4119844461

0.4692828732875094
0.46938015414847284
0.46947743500943623
0.4695747158703997
0.46967199673136306
0.4697692775923265
0.4698665584532899
0.46996383931425334
0.47006112017521673
0.4701584010361801
0.47025568189714356
0.47035296275810695
0.4704502436190704
0.4705475244800338
0.47064480534099723
0.4707420862019606
0.47083936706292406
0.47093664792388745
0.47103392878485084
0.4711312096458143
0.4712284905067777
0.4713257713677411
0.4714230522287045
0.47152033308966795
0.47161761395063134
0.4717148948115948
0.4718121756725582
0.4719094565335216
0.472006737394485
0.4721040182554484
0.47220129911641184
0.47229857997737523
0.4723958608383387
0.47249314169930207
0.4725904225602655
0.4726877034212289
0.47278498428219234
0.47288226514315573
0.4729795460041191
0.47307682686508257
0.47317410772604596
0.4732713885870094
0.4733686694479728
0.47346595030893623
0.4735632311698996
0.47366051203086307
0.47375779289182646
0.4738550737527899
0.4739523546137533
0.4740496354747167
0.4741469163356801
0.4742441

0.5353365778816676
0.5354338587426309
0.5355311396035944
0.5356284204645578
0.5357257013255212
0.5358229821864846
0.535920263047448
0.5360175439084115
0.5361148247693749
0.5362121056303383
0.5363093864913017
0.5364066673522652
0.5365039482132286
0.5366012290741919
0.5366985099351553
0.5367957907961187
0.5368930716570822
0.5369903525180456
0.537087633379009
0.5371849142399724
0.5372821951009359
0.5373794759618993
0.5374767568228627
0.5375740376838261
0.5376713185447894
0.537768599405753
0.5378658802667163
0.5379631611276797
0.5380604419886431
0.5381577228496066
0.53825500371057
0.5383522845715334
0.5384495654324968
0.5385468462934603
0.5386441271544237
0.5387414080153871
0.5388386888763504
0.5389359697373138
0.5390332505982773
0.5391305314592407
0.5392278123202041
0.5393250931811675
0.539422374042131
0.5395196549030944
0.5396169357640578
0.5397142166250212
0.5398114974859846
0.5399087783469481
0.5400060592079114
0.5401033400688748
0.5402006209298382
0.5402979017908017
0.5403951826517651

0.5920513198233379
0.5921486006843014
0.5922458815452648
0.5923431624062282
0.5924404432671916
0.592537724128155
0.5926350049891185
0.5927322858500819
0.5928295667110453
0.5929268475720086
0.5930241284329721
0.5931214092939355
0.5932186901548989
0.5933159710158623
0.5934132518768257
0.5935105327377892
0.5936078135987526
0.593705094459716
0.5938023753206794
0.5938996561816429
0.5939969370426063
0.5940942179035696
0.594191498764533
0.5942887796254965
0.5943860604864599
0.5944833413474233
0.5945806222083867
0.5946779030693501
0.5947751839303136
0.594872464791277
0.5949697456522404
0.5950670265132038
0.5951643073741673
0.5952615882351306
0.595358869096094
0.5954561499570574
0.5955534308180208
0.5956507116789843
0.5957479925399477
0.5958452734009111
0.5959425542618745
0.596039835122838
0.5961371159838014
0.5962343968447648
0.5963316777057281
0.5964289585666915
0.596526239427655
0.5966235202886184
0.5967208011495818
0.5968180820105452
0.5969153628715087
0.5970126437324721
0.5971099245934355


0.6562566880591912
0.6563539689201546
0.656451249781118
0.6565485306420815
0.6566458115030449
0.6567430923640083
0.6568403732249717
0.6569376540859352
0.6570349349468986
0.657132215807862
0.6572294966688254
0.6573267775297887
0.6574240583907522
0.6575213392517156
0.657618620112679
0.6577159009736424
0.6578131818346059
0.6579104626955693
0.6580077435565327
0.6581050244174961
0.6582023052784595
0.658299586139423
0.6583968670003864
0.6584941478613497
0.6585914287223131
0.6586887095832766
0.65878599044424
0.6588832713052034
0.6589805521661668
0.6590778330271302
0.6591751138880937
0.6592723947490571
0.6593696756100205
0.6594669564709839
0.6595642373319474
0.6596615181929107
0.6597587990538741
0.6598560799148375
0.659953360775801
0.6600506416367644
0.6601479224977278
0.6602452033586912
0.6603424842196546
0.6604397650806181
0.6605370459415815
0.6606343268025449
0.6607316076635082
0.6608288885244717
0.6609261693854351
0.6610234502463985
0.6611207311073619
0.6612180119683253
0.6613152928292888


0.7145279237762763
0.7146252046372397
0.714722485498203
0.7148197663591666
0.7149170472201299
0.7150143280810933
0.7151116089420567
0.7152088898030201
0.7153061706639836
0.715403451524947
0.7155007323859104
0.7155980132468738
0.7156952941078373
0.7157925749688007
0.715889855829764
0.7159871366907274
0.7160844175516908
0.7161816984126543
0.7162789792736177
0.7163762601345811
0.7164735409955445
0.716570821856508
0.7166681027174714
0.7167653835784348
0.7168626644393982
0.7169599453003616
0.7170572261613251
0.7171545070222884
0.7172517878832518
0.7173490687442152
0.7174463496051787
0.7175436304661421
0.7176409113271055
0.7177381921880689
0.7178354730490323
0.7179327539099958
0.7180300347709592
0.7181273156319226
0.718224596492886
0.7183218773538494
0.7184191582148128
0.7185164390757762
0.7186137199367396
0.7187110007977031
0.7188082816586665
0.7189055625196299
0.7190028433805933
0.7191001242415567
0.7191974051025202
0.7192946859634836
0.719391966824447
0.7194892476854103
0.7195865285463738

0.7574287834611418
0.7575260643221052
0.7576233451830687
0.7577206260440321
0.7578179069049955
0.7579151877659589
0.7580124686269223
0.7581097494878858
0.7582070303488492
0.7583043112098126
0.758401592070776
0.7584988729317395
0.7585961537927028
0.7586934346536662
0.7587907155146296
0.7588879963755931
0.7589852772365565
0.7590825580975199
0.7591798389584833
0.7592771198194467
0.7593744006804102
0.7594716815413736
0.759568962402337
0.7596662432633003
0.7597635241242638
0.7598608049852272
0.7599580858461906
0.760055366707154
0.7601526475681174
0.7602499284290809
0.7603472092900443
0.7604444901510077
0.7605417710119711
0.7606390518729346
0.760736332733898
0.7608336135948613
0.7609308944558247
0.7610281753167881
0.7611254561777516
0.761222737038715
0.7613200178996784
0.7614172987606418
0.7615145796216053
0.7616118604825687
0.7617091413435321
0.7618064222044955
0.7619037030654588
0.7620009839264223
0.7620982647873857
0.7621955456483491
0.7622928265093125
0.762390107370276
0.7624873882312394

0.800232362285044
0.8003296431460074
0.8004269240069709
0.8005242048679343
0.8006214857288977
0.8007187665898611
0.8008160474508246
0.800913328311788
0.8010106091727514
0.8011078900337147
0.8012051708946781
0.8013024517556416
0.801399732616605
0.8014970134775684
0.8015942943385318
0.8016915751994953
0.8017888560604587
0.8018861369214221
0.8019834177823855
0.8020806986433489
0.8021779795043124
0.8022752603652757
0.8023725412262391
0.8024698220872025
0.802567102948166
0.8026643838091294
0.8027616646700928
0.8028589455310562
0.8029562263920196
0.8030535072529831
0.8031507881139465
0.8032480689749099
0.8033453498358732
0.8034426306968367
0.8035399115578001
0.8036371924187635
0.8037344732797269
0.8038317541406904
0.8039290350016538
0.8040263158626172
0.8041235967235806
0.804220877584544
0.8043181584455075
0.8044154393064709
0.8045127201674342
0.8046100010283976
0.8047072818893611
0.8048045627503245
0.8049018436112879
0.8049991244722513
0.8050964053332147
0.8051936861941782
0.805290967055141

0.8530558697881778
0.8531531506491413
0.8532504315101047
0.8533477123710681
0.8534449932320315
0.853542274092995
0.8536395549539584
0.8537368358149218
0.8538341166758852
0.8539313975368485
0.854028678397812
0.8541259592587754
0.8542232401197388
0.8543205209807022
0.8544178018416657
0.8545150827026291
0.8546123635635925
0.8547096444245559
0.8548069252855194
0.8549042061464828
0.8550014870074462
0.8550987678684095
0.8551960487293729
0.8552933295903364
0.8553906104512998
0.8554878913122632
0.8555851721732266
0.8556824530341901
0.8557797338951535
0.8558770147561169
0.8559742956170803
0.8560715764780437
0.8561688573390072
0.8562661381999706
0.8563634190609339
0.8564606999218973
0.8565579807828608
0.8566552616438242
0.8567525425047876
0.856849823365751
0.8569471042267144
0.8570443850876779
0.8571416659486413
0.8572389468096047
0.857336227670568
0.8574335085315316
0.8575307893924949
0.8576280702534583
0.8577253511144217
0.8578226319753851
0.8579199128363486
0.858017193697312
0.858114474558275

0.8958594486120801
0.8959567294730435
0.8960540103340069
0.8961512911949703
0.8962485720559337
0.8963458529168972
0.8964431337778606
0.896540414638824
0.8966376954997873
0.8967349763607508
0.8968322572217142
0.8969295380826776
0.897026818943641
0.8971240998046044
0.8972213806655679
0.8973186615265313
0.8974159423874947
0.8975132232484581
0.8976105041094216
0.897707784970385
0.8978050658313483
0.8979023466923117
0.8979996275532751
0.8980969084142386
0.898194189275202
0.8982914701361654
0.8983887509971288
0.8984860318580923
0.8985833127190557
0.8986805935800191
0.8987778744409824
0.898875155301946
0.8989724361629093
0.8990697170238727
0.8991669978848361
0.8992642787457995
0.899361559606763
0.8994588404677264
0.8995561213286898
0.8996534021896532
0.8997506830506167
0.8998479639115801
0.8999452447725435
0.9000425256335068
0.9001398064944702
0.9002370873554337
0.9003343682163971
0.9004316490773605
0.9005289299383239
0.9006262107992874
0.9007234916602508
0.9008207725212142
0.9009180533821776

0.989832760302738
0.9899300411637014
0.9900273220246648
0.9901246028856283
0.9902218837465917
0.9903191646075551
0.9904164454685185
0.990513726329482
0.9906110071904454
0.9907082880514088
0.9908055689123721
0.9909028497733356
0.991000130634299
0.9910974114952624
0.9911946923562258
0.9912919732171892
0.9913892540781527
0.9914865349391161
0.9915838158000795
0.9916810966610429
0.9917783775220064
0.9918756583829698
0.9919729392439331
0.9920702201048965
0.9921675009658599
0.9922647818268234
0.9923620626877868
0.9924593435487502
0.9925566244097136
0.9926539052706771
0.9927511861316405
0.9928484669926039
0.9929457478535673
0.9930430287145307
0.9931403095754942
0.9932375904364575
0.9933348712974209
0.9934321521583843
0.9935294330193478
0.9936267138803112
0.9937239947412746
0.993821275602238
0.9939185564632014
0.9940158373241649
0.9941131181851283
0.9942103990460917
0.994307679907055
0.9944049607680185
0.9945022416289819
0.9945995224899453
0.9946968033509087
0.9947940842118722
0.994891365072835

In [32]:
#vectorize likely percentage
percentage_like_vec = []
for index in range(len(percent_like)):
    str_com = str(percent_like[index]).replace('-','').replace('[','').replace(']', '').replace('\'' , '').replace(',','')
    word = word_tokenize(str_com)
    for i in range(len(word)):
        try:
            if len(word) == 1:
                percentage_like_vec.append(model[str(word[0])])
            if len(word) == 2:
                percentage_like_vec.append(model[str(word[0])]+model[str(word[1])])
            elif len(word) == 3:
                percentage_like_vec.append(model[str(word[0])]+model[str(word[1])]+model[str(word[2])])
            elif len(word) == 4:
                percentage_like_vec.append(model[str(word[0])]+model[str(word[1])]+model[str(word[2])]+model[str(word[3])])
            #elif len(word) == 5:
                #com_like_vec.append(model[str(word[0])]+model[str(word[1])]+model[str(word[2])]+model[str(word[3])]+model[str(word[4])])
            #elif len(word) == 6:
                #com_like_vec.append(model[str(word[0])]+model[str(word[1])]+model[str(word[2])]+model[str(word[3])]+model[str(word[4])]+model[str(word[5])])
        except KeyError:
            pass
        continue

  # This is added back by InteractiveShellApp.init_path()
  from ipykernel import kernelapp as app
  del sys.path[0]


In [30]:
#Find likely Company

company_like = []

for i in range(articles.shape[0]):
    if i % 70000 == 0:  #progress index
        print(i/719566)
    match_find_com1 = re.findall(r"(?=([A-Z][a-z]+))", str(articles.iloc[i:i+1,0]))
    match_find_com2 = re.findall(r"(?=([A-Z][a-z]+ [A-Z][a-z]+))", str(articles.iloc[i:i+1,0]))
    match_find_com3 = re.findall(r"(?=([A-Z][a-z]+ [A-Z][a-z]+ [A-Z][a-z]+))", str(articles.iloc[i:i+1,0]))
    match_find_com4 = re.findall(r"(?=([A-Z][a-z]+ [A-Z][a-z]+ [A-Z][a-z]+ [A-Z][a-z]+))", str(articles.iloc[i:i+1,0]))
    
    if match_find_com1:
        company_like.append(match_find_com1)
    if match_find_com2:
        company_like.append(match_find_com2)
    if match_find_com3:
        company_like.append(match_find_com3)
    if match_find_com4:
        company_like.append(match_find_com4)

0.0
0.09728086096341405
0.1945617219268281
0.2918425828902422
0.3891234438536562
0.4864043048170703
0.5836851657804843
0.6809660267438984
0.7782468877073124
0.8755277486707265
0.9728086096341406


In [33]:
#Vectorize likely company

com_like_vec = []
for index in range(len(company_like)):
    str_com = str(company_like[index]).replace('-','').replace('[','').replace(']', '').replace('\'' , '').replace(',','')
    word = word_tokenize(str_com)
    for i in range(len(word)):
        try:
            if len(word) == 1:
                com_like_vec.append(model[str(word[0])])
            if len(word) == 2:
                com_like_vec.append(model[str(word[0])]+model[str(word[1])])
            elif len(word) == 3:
                com_like_vec.append(model[str(word[0])]+model[str(word[1])]+model[str(word[2])])
            elif len(word) == 4:
                com_like_vec.append(model[str(word[0])]+model[str(word[1])]+model[str(word[2])]+model[str(word[3])])
            #elif len(word) == 5:
                #com_like_vec.append(model[str(word[0])]+model[str(word[1])]+model[str(word[2])]+model[str(word[3])]+model[str(word[4])])
            #elif len(word) == 6:
                #com_like_vec.append(model[str(word[0])]+model[str(word[1])]+model[str(word[2])]+model[str(word[3])]+model[str(word[4])]+model[str(word[5])])
        except KeyError:
            pass
        continue

  app.launch_new_instance()
  if sys.path[0] == '':
  
  # Remove the CWD from sys.path while we load stuff.


In [34]:
#Find likely CEO

ceo_like = []

for i in range(articles.shape[0]):
    if i % 70000 == 0:  #progress index
        print(i/719566)
    #match_find_com1 = re.findall(r"(?=([A-Z][a-z]+))", str(articles.iloc[i:i+1,0]))
    match_find_com2 = re.findall(r"(?=([A-Z][a-z]+ [A-Z][a-z]+))", str(articles.iloc[i:i+1,0]))
    match_find_com3 = re.findall(r"(?=([A-Z][a-z]+ [A-Z][a-z]+ [A-Z][a-z]+))", str(articles.iloc[i:i+1,0]))
    #match_find_com4 = re.findall(r"(?=([A-Z][a-z]+ [A-Z][a-z]+ [A-Z][a-z]+ [A-Z][a-z]+))", str(articles.iloc[i:i+1,0]))
    
    #if match_find_com1:
        #company_like.append(match_find_com1)
    if match_find_com2:
        ceo_like.append(match_find_com2)
    if match_find_com3:
        ceo_like.append(match_find_com3)
    #if match_find_com4:
        #company_like.append(match_find_com4)

0.0
0.09728086096341405
0.1945617219268281
0.2918425828902422
0.3891234438536562
0.4864043048170703
0.5836851657804843
0.6809660267438984
0.7782468877073124
0.8755277486707265
0.9728086096341406


In [35]:
ceo_like_vec = []
for index in range(len(company_like)):
    str_com = str(company_like[index]).replace('-','').replace('[','').replace(']', '').replace('\'' , '').replace(',','')
    word = word_tokenize(str_com)
    for i in range(len(word)):
        try:
            #if len(word) == 1:
                #com_like_vec.append(model[str(word[0])])
            if len(word) == 2:
                ceo_like_vec.append(model[str(word[0])]+model[str(word[1])])
            elif len(word) == 3:
                ceo_like_vec.append(model[str(word[0])]+model[str(word[1])]+model[str(word[2])])
            #elif len(word) == 4:
                #com_like_vec.append(model[str(word[0])]+model[str(word[1])]+model[str(word[2])]+model[str(word[3])])
            #elif len(word) == 5:
                #com_like_vec.append(model[str(word[0])]+model[str(word[1])]+model[str(word[2])]+model[str(word[3])]+model[str(word[4])])
            #elif len(word) == 6:
                #com_like_vec.append(model[str(word[0])]+model[str(word[1])]+model[str(word[2])]+model[str(word[3])]+model[str(word[4])]+model[str(word[5])])
        except KeyError:
            pass
        continue

  # Remove the CWD from sys.path while we load stuff.
  if sys.path[0] == '':


In [39]:
percent_like_vec = []
for index in range(len(match_percent)):
    str_com = str(match_percent[index]).replace('-','').replace('[','').replace(']', '').replace('\'' , '').replace(',','')
    word = word_tokenize(str_com)
    for i in range(len(word)):
        try:
            if len(word) == 1:
                percent_like_vec.append(model[str(word[0])])
            if len(word) == 2:
                percent_like_vec.append(model[str(word[0])]+model[str(word[1])])
            elif len(word) == 3:
                percent_like_vec.append(model[str(word[0])]+model[str(word[1])]+model[str(word[2])])
            elif len(word) == 4:
                percent_like_vec.append(model[str(word[0])]+model[str(word[1])]+model[str(word[2])]+model[str(word[3])])
            elif len(word) == 5:
                percent_like_vec.append(model[str(word[0])]+model[str(word[1])]+model[str(word[2])]+model[str(word[3])]+model[str(word[4])])
            elif len(word) == 6:
                percent_like_vec.append(model[str(word[0])]+model[str(word[1])]+model[str(word[2])]+model[str(word[3])]+model[str(word[4])]+model[str(word[5])])
        except KeyError:
            pass
        continue

  # Remove the CWD from sys.path while we load stuff.
  if sys.path[0] == '':


In [37]:
print ("Len of percentage_like list"+str(len(percentage_like_vec)))

Len of percentage_like66242


In [38]:
print ("Len of Company_like_vec list"+str(len(com_like_vec)))

Len of Company_like1689408


In [39]:
print ("Len of CEO_like_vec list"+str(len(ceo_like_vec)))

Len of CEO_like_vec list1335590


### Find negative labels

In [40]:
Adjective_sentence = open('./all_sentences.csv',encoding='utf-8',errors='ignore')
pos_sentence = Adjective_sentence.readlines()

In [42]:
#Use Pos Tag to find Capitalied Adjective, treat as negative label

Adjective = []
Verb = []
Determiner = []
for index in range (len(pos_sentence)):
    if index % 71900 == 0: #progress index
        print (index/719566)
    
    match_upper = re.findall('[A-Z]\w+',str(pos_sentence[index]).replace('-','').replace('[','').replace(']', '').replace('\'' , '').replace(',',''))
    word_token = word_tokenize(str(match_upper))
    sent_tagged = pos_tag(word_token)
    for i in range (len(sent_tagged)):
        if sent_tagged[i][1] == 'JJ':
            Adjective.append(sent_tagged[i][0])
        if sent_tagged[i][1] == 'VB':
            Verb.append(sent_tagged[i][0])
        if sent_tagged[i][1] == 'WDT':
            Determiner.appen(sent_tagged[i][0])

0.0
0.09992134147527815
0.1998426829505563
0.29976402442583444
0.3996853659011126
0.4996067073763908
0.5995280488516689
0.6994493903269471
0.7993707318022252
0.8992920732775034
0.9992134147527816


In [43]:
#Vectorize Capitalied Adjective

adjective_vec = []
for index in range(len(Adjective)):
    str_com = str(Adjective[index]).replace('-','').replace('[','').replace(']', '').replace('\'' , '').replace(',','')
    word = word_tokenize(str_com)
    for i in range(len(word)):
        try:
            if len(word) == 1:
                adjective_vec.append(model[str(word[0])])
            if len(word) == 2:
                adjective.append(model[str(word[0])]+model[str(word[1])])
            elif len(word) == 3:
                adjective_vec.append(model[str(word[0])]+model[str(word[1])]+model[str(word[2])]) 
        except KeyError:
            pass
        continue

  # Remove the CWD from sys.path while we load stuff.


In [44]:
#Vectorize VERB

verb_vec = []
for index in range(len(Verb)):
    str_com = str(Verb[index]).replace('-','').replace('[','').replace(']', '').replace('\'' , '').replace(',','')
    word = word_tokenize(str_com)
    for i in range(len(word)):
        try:
            if len(word) == 1:
                verb_vec.append(model[str(word[0])])
            if len(word) == 2:
                verb_vec.append(model[str(word[0])]+model[str(word[1])])
            elif len(word) == 3:
                verb_vec.append(model[str(word[0])]+model[str(word[1])]+model[str(word[2])]) 
        except KeyError:
            pass
        continue

  # Remove the CWD from sys.path while we load stuff.


In [45]:
#Vectorize Determiner

determiner_vec = []
for index in range(len(Determiner)):
    str_com = str(Determiner[index]).replace('-','').replace('[','').replace(']', '').replace('\'' , '').replace(',','')
    word = word_tokenize(str_com)
    for i in range(len(word)):
        try:
            if len(word) == 1:
                determiner_vec.append(model[str(word[0])])
            if len(word) == 2:
                determiner_vec.append(model[str(word[0])]+model[str(word[1])])
            elif len(word) == 3:
                determiner_vec.append(model[str(word[0])]+model[str(word[1])]+model[str(word[2])]) 
        except KeyError:
            pass
        continue

### Positive and Negative data for Company

In [46]:
#positive data= Company csv labeled data
#negative data= CEO csv labeled data & capitalied adjective,verb,determiner
pos_data_company = company_vec.copy()
neg_data_company = ceo_vec+adjective_vec+verb_vec+determiner_vec

In [47]:
#Add label and transfer to Dataframe
pos_data_company  = pd.DataFrame(pos_data_company)
pos_data_company['100'] = 1
neg_data_company  = pd.DataFrame(neg_data_company)
neg_data_company['100'] = 0

In [103]:
#Shuffle data and pick top 8201 lines
neg_data_company = shuffle(neg_data_company)

In [107]:
neg_data_company = neg_data_company[0:8201]

In [108]:
Train_data_company = pd.concat([pos_data_company,neg_data_company],join='outer')

In [16]:
from sklearn.utils import shuffle

In [109]:
#Shuffle data
Train_data_company = shuffle(Train_data_company)

In [110]:
#x= feature, y=label
x_values = Train_data_company.drop('100', axis = 1)
y_values = Train_data_company['100']

In [111]:
x_train, x_test, y_train, y_test = train_test_split(x_values, y_values, test_size=0.5)

### Positive and Negative data for CEO

In [55]:
#positive data= CEO csv labeled data
#negative data= Company csv labeled data 
pos_data_ceo = ceo_vec.copy()
neg_data_ceo = company_vec.copy()

In [56]:
#Add label
pos_data_ceo = pd.DataFrame(pos_data_ceo)
pos_data_ceo['100'] = 1
neg_data_ceo = pd.DataFrame(neg_data_ceo)
neg_data_ceo['100'] = 0

In [120]:
neg_data_ceo = shuffle(neg_data_ceo)

In [122]:
neg_data_ceo = neg_data_ceo[0:3955]

In [123]:
Train_data_ceo = pd.concat([pos_data_ceo,neg_data_ceo],join='outer')

In [124]:
#Shuffle data
Train_data_ceo = shuffle(Train_data_ceo)

In [125]:
x_values_ceo = Train_data_ceo.drop('100', axis = 1)
y_values_ceo = Train_data_ceo['100']

In [126]:
x_train_ceo, x_test_ceo, y_train_ceo, y_test_ceo = train_test_split(x_values_ceo, y_values_ceo, test_size=0.3)

# Extract Company & CEO_Part 2

### Naive Bayes classifier

In [23]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

### Predict Company data

In [112]:
clf = GaussianNB()

In [113]:
#train clssifier
clf.fit(x_train,y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [114]:
#predict
company_predict_train = clf.predict(x_train)

In [115]:
print("Accuracy：")
clf.score(x_test,y_test)

Accuracy：


0.887452749664675

In [72]:
com_like_df = pd.DataFrame(com_like_vec[1:906167])

In [73]:
clf_predict = clf.predict(com_like_df)

In [74]:
bayes_company = []

In [75]:
#Go through company like words in TXT
for index, classification in enumerate(clf_predict):
    if classification == 1:
        bayes_company.append(company_like[index][0])

In [76]:
found_companies_bayes = set(bayes_company)
output_bayes = set(list(found_companies_bayes))

In [77]:
#remove Stop words
stop_words=sorted(set(stopwords.words("english")))
filtered_output_bayes = [word for word in output_bayes if word not in stop_words]

In [78]:
#add predict and known campany name together
final_company_bayes = set(list(filtered_output_bayes) + list(company_list))

In [108]:
pd.Series(list(final_company_bayes)).to_csv("final_company_bayes1.csv", header = False, index = False)

### Predict CEO data

In [127]:
clf_ceo = GaussianNB()

In [128]:
x_train_ceo, x_test_ceo, y_train_ceo, y_test_ceo = train_test_split(x_values_ceo, y_values_ceo, test_size=0.3)

In [129]:
clf_ceo.fit(x_train_ceo,y_train_ceo)

GaussianNB(priors=None, var_smoothing=1e-09)

In [130]:
clf_ceo_fit = clf_ceo.predict(x_train_ceo)

In [131]:
clf_ceo.score(x_train_ceo,y_train_ceo)

0.8679790500270905

In [132]:
len(ceo_like_vec[0:173610])

173610

In [133]:
ceo_like_df = pd.DataFrame(ceo_like_vec[0:173610])

In [134]:
clf_ceo_predict = clf_ceo.predict(ceo_like_df)

In [87]:
bayes_ceo = []

In [88]:
for index, classification in enumerate(clf_ceo_predict):
    if classification == 1:
        bayes_ceo.append(ceo_like[index][0])

In [89]:
found_ceo_bayes = set(bayes_ceo)
output_bayes = set(list(found_ceo_bayes))

In [90]:
filtered_output_bayes_ceo = [word for word in output_bayes if word not in stop_words]

In [91]:
final_ceo_bayes = set(list(filtered_output_bayes_ceo) + list(ceo_list))

In [145]:
pd.Series(list(final_ceo_bayes)).to_csv("final_ceo_bayes.csv", header = False, index = False)

### Additional Information_Percentage Verification

In [None]:
#positive data= Percent csv labeled data
#negative data= ceo and company csv labeled data
pos_data_percent = percent_vec.copy()
neg_data_percent = ceo_vec+company_vec

In [14]:
len(neg_data_percent)

12156

In [15]:
#Add label
pos_data_percent = pd.DataFrame(pos_data_percent)
pos_data_percent['100'] = 1
neg_data_percent = pd.DataFrame(neg_data_percent)
neg_data_percent['100'] = 0

In [17]:
neg_data_percent = shuffle(neg_data_percent)

In [18]:
neg_data_percent = neg_data_percent[0:5699]

In [19]:
Train_data_percent = pd.concat([pos_data_percent,neg_data_percent],join='outer')

In [20]:
#Shuffle data
Train_data_percent = shuffle(Train_data_percent)

In [21]:
x_values_percent = Train_data_percent.drop('100', axis = 1)
y_values_percent = Train_data_percent['100']

In [22]:
x_train_percent, x_test_percent, y_train_percent, y_test_percent = train_test_split(x_values_percent, y_values_percent, test_size=0.3)

In [24]:
clf_percent = GaussianNB()

In [25]:
clf_percent.fit(x_train_percent,y_train_percent)

GaussianNB(priors=None, var_smoothing=1e-09)

In [26]:
clf_percent.score(x_test_percent,y_test_percent)

0.9374269005847953

In [42]:
clf_percent_predict = clf_percent.predict(percent_like_vec)

In [48]:
count = 0
for i in range(len(clf_percent_predict)):
    if clf_percent_predict[i] == 0:
        count+=1
print (count)

1876


In [49]:
len(match_percent)

78953