In [1]:
import numpy as np
import pandas as pd

In [2]:
Health_care_news = pd.read_csv("C:/Users/thisi/Workspace/AI_dataset/HPC_data/health_care_news_data.csv")

In [3]:
Health_care_news.shape

(8577, 12)

In [4]:
Health_care_news.description.head()

0    2017-12-14 00:00:00\n\nGlobal program to comme...
1    2017-12-13 00:00:00\n\nPfizer-Developed Biosim...
2    2017-12-13 00:00:00\n\nWednesday, December 13,...
3    2017-12-12 00:00:00\n\nTuesday, December 12, 2...
4    2017-12-11 00:00:00\n\nData include eleventh p...
Name: description, dtype: object

>  ### Step 1: Considered only 100 articles as we need more memory to process the entire dataset (8577 articles) 

In [5]:
health_care_desc = Health_care_news["description"].head(100)

In [6]:
print(health_care_desc.head(5))

0    2017-12-14 00:00:00\n\nGlobal program to comme...
1    2017-12-13 00:00:00\n\nPfizer-Developed Biosim...
2    2017-12-13 00:00:00\n\nWednesday, December 13,...
3    2017-12-12 00:00:00\n\nTuesday, December 12, 2...
4    2017-12-11 00:00:00\n\nData include eleventh p...
Name: description, dtype: object


In [7]:
type(health_care_desc)

pandas.core.series.Series

> ### Step 2: Data Cleaning

> **Common data cleaning steps on all text using Regex function:**
> * Make text all lower case
> * Remove punctuation
> * Remove numerical values
> * Remove common non-sensical text (/n)
> * Tokenize text
> * Remove stop words

In [8]:
import re 
import string

In [9]:
def clean_text(row):
    sample_text = re.sub('[\t\n\r\f\v\"]', '',row)
    sample_text = re.sub(r'Contact:[\s\w,\-\d\[\]\:]*', '',sample_text)
    sample_text = re.sub('(\d{4})-(\d{2})-(\d{2}).(\d{2})\:(\d{2})\:(\d{2})', '', sample_text)
    sample_text = re.sub('\[.*?\]', '', sample_text) # remove text in square brackets
    sample_text = re.sub('\w*\d\w*', '', sample_text) # remove words containing numbers
    return sample_text

In [10]:
health_care_desc = health_care_desc.apply(lambda x:clean_text(x))

In [11]:
health_care_desc.head()

0    Global program to commence with pivotal study ...
1    Pfizer-Developed Biosimilar Medicine IXIFITM (...
2    Wednesday, December ,  - : Inc. (NYSE: PFE) to...
3    Tuesday, December ,  - : Inc. (NYSE: PFE) anno...
4    Data include eleventh participant in the trial...
Name: description, dtype: object

In [12]:
health_care_desc[3]



In [13]:
import spacy
nlp = spacy.load('en_core_web_sm')

spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

def remove_stop_word_tokenize(row):
    doc = nlp(row)
    tokens = [token.text for token in doc if not token.is_stop]
    return " ".join(tokens)

In [14]:
health_care_desc = health_care_desc.apply(lambda x:remove_stop_word_tokenize(x))

In [15]:
health_care_desc.head()

0    Global program commence pivotal study   North ...
1    Pfizer - Developed Biosimilar Medicine IXIFITM...
2    Wednesday , December ,   - : Inc. ( NYSE : PFE...
3    Tuesday , December ,   - : Inc. ( NYSE : PFE )...
4    Data include eleventh participant trialAnnuali...
Name: description, dtype: object

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(ngram_range=(1, 1), min_df=2)
X = tfidf.fit_transform(health_care_desc).toarray()

In [17]:
X.shape

(100, 4259)

In [18]:
from sklearn.decomposition import TruncatedSVD

In [19]:
lsa = TruncatedSVD(n_components=20, n_iter=100)

In [20]:
lsa.fit(X)

TruncatedSVD(algorithm='randomized', n_components=20, n_iter=100,
       random_state=None, tol=0.0)

In [21]:
type(lsa.components_)

numpy.ndarray

In [22]:
lsa.components_.shape

(20, 4259)

In [23]:
lsa.components_[0].shape

(4259,)

In [24]:
terms = tfidf.get_feature_names()
for i, component in enumerate(lsa.components_):
    print(i, component)

0 [0.002147   0.00102765 0.00102713 ... 0.02057985 0.0016624  0.0054612 ]
1 [-0.00093543 -0.00057637 -0.00071028 ... -0.00476624 -0.00082469
  0.01884727]
2 [ 1.55611761e-03  3.40977585e-03 -1.11862799e-03 ...  1.04740081e-02
  3.86021316e-05 -5.50825933e-04]
3 [-3.74589145e-03  1.69677939e-03 -9.19973446e-06 ... -1.67948447e-02
 -1.70080541e-03  1.57713976e-03]
4 [-1.43815104e-03 -9.49093537e-04  2.80602423e-03 ... -3.91008171e-03
  1.21243178e-03  7.78987802e-05]
5 [-0.00012587 -0.00015555 -0.0026217  ... -0.00787145 -0.00157148
  0.00049813]
6 [-0.00466698 -0.00026494  0.00398391 ... -0.01256834  0.00069119
  0.00060874]
7 [-7.74077329e-03  5.30853906e-05 -2.01705695e-03 ... -7.50852082e-03
 -4.10137182e-03 -9.20686725e-06]
8 [ 2.21581651e-02 -1.46093693e-03 -3.74755957e-03 ... -1.33631613e-02
  6.66264519e-03 -7.10853635e-05]
9 [ 4.33074898e-03  4.82036987e-05  8.45215903e-03 ... -7.19933463e-03
  6.34286026e-03  6.15088786e-04]
10 [ 7.94504743e-04 -1.68742688e-03 -2.41165026e-04 .

In [25]:
terms = tfidf.get_feature_names()
for i, component in enumerate(lsa.components_):
    terms_components = zip(terms, component)
    if i == 0:
        for t in terms_components:
            print(t)
        break;

('aav', 0.002147004733888516)
('abbott', 0.0010276474240139895)
('abc', 0.0010271334753984471)
('abdominal', 0.009014862883028425)
('ability', 0.02072719076135108)
('able', 0.012147413825710779)
('abnormal', 0.003246332394605205)
('abnormalities', 0.011325001328044168)
('abnormalitieslymphocyte', 0.0007969316383601412)
('abnormalitiestreatment', 0.0007969316383601412)
('about', 0.018939519561394995)
('abr', 0.003007591859048735)
('abrazo', 0.0074020257348211315)
('absence', 0.0038726311605621804)
('absolute', 0.001358555039038819)
('absorbed', 0.001327875420341536)
('abstr', 0.0007825405908914203)
('abstract', 0.013990950810062564)
('abstracts', 0.011606707104169906)
('ac', 0.0007549717220708056)
('academic', 0.0095030605829862)
('academy', 0.004177491675377632)
('acc', 0.004590176899044394)
('accelerate', 0.005705985040146042)
('accelerated', 0.013754135551222114)
('accelerates', 0.0009683715343670218)
('acceptance', 0.002807958241045714)
('accepted', 0.006493158544725783)
('access', 

('blisters', 0.001399343251779287)
('blocker', 0.0008491044020617873)
('blockers', 0.0008491044020617873)
('blocking', 0.0038605645602460396)
('blocks', 0.000812483446364183)
('blood', 0.027129608726219466)
('bloodstream', 0.0022728665815132253)
('blue', 0.005572221993154193)
('blurred', 0.00168877613576497)
('bmc', 0.0016186468438885917)
('bms', 0.00905206427479016)
('bmy', 0.0024808903793168504)
('board', 0.01177824263514088)
('body', 0.010229391430554853)
('bold', 0.0012231102942266779)
('bone', 0.0019832500100167706)
('book', 0.0016359332330626965)
('booklet', 0.0010341221162794064)
('bor', 0.002322591608418342)
('borg', 0.000584289046303383)
('boshoff', 0.00169747208003169)
('bosulif', 0.020598255365147276)
('bosutinib', 0.006460973624953164)
('boundaries', 0.0015360623928524226)
('bourla', 0.0075442943695467665)
('bowel', 0.007116369009185798)
('bowen', 0.000584289046303383)
('boxed', 0.006717188284891055)
('bpm', 0.0016287599330051326)
('bradyarrhythmias', 0.0008143799665025663)

('creatinine', 0.003283659876845201)
('credit', 0.003212748785702901)
('cresemba', 0.043609260140288916)
('cri', 0.0028831571297502226)
('crisis', 0.0029056916934888464)
('criteria', 0.0056259785023764055)
('critical', 0.010189041867771647)
('crizotinib', 0.007582903475719295)
('crohn', 0.010378041262097439)
('crohns', 0.002240493449770543)
('crohnscolitisfoundation', 0.0011800751069719056)
('cross', 0.005074250124977988)
('crossrefmedline', 0.001327875420341536)
('crowe', 0.012440271854196194)
('crown', 0.0050677702969801255)
('crpc', 0.008296192510340934)
('crucial', 0.0013496590014646607)
('cryptococcosis', 0.0007969316383601412)
('crystals', 0.003917144189725838)
('ctla', 0.0015447668331595127)
('cumulative', 0.0023129139120645504)
('curative', 0.0008781555611071638)
('cure', 0.013093995686253026)
('cures', 0.021929418930567577)
('currency', 0.002989415392933099)
('current', 0.02041292729497941)
('currently', 0.02116141499790564)
('custom', 0.0037628435507857786)
('customers', 0.00

('evaluation', 0.009411345083051716)
('evaluations', 0.001423915492362969)
('event', 0.002649168084277684)
('events', 0.0372227569591971)
('evers', 0.0016186468438885917)
('everyday', 0.0020922246690799557)
('evidence', 0.012166446198357198)
('evolve', 0.0030045099779320876)
('evolving', 0.006663815423948331)
('exact', 0.002462519778978322)
('exaggerated', 0.00122013942267337)
('examination', 0.0007969316383601412)
('examinations', 0.00132394752991564)
('examined', 0.0010602127101926526)
('examining', 0.0008143799665025663)
('example', 0.004571725399740945)
('exceeded', 0.0023796050264497377)
('excellence', 0.0027096155109785085)
('excellent', 0.0021578253011031562)
('exception', 0.002001104755557602)
('excessive', 0.0028204383698371437)
('exchange', 0.023661861487972363)
('excier', 0.000584289046303383)
('excipients', 0.0009975499850029344)
('excited', 0.0054278509907869895)
('exclude', 0.0031968820930673466)
('excluded', 0.0038906839194960134)
('excluding', 0.0024202516506114475)
('e

('identifier', 0.0022630281257151417)
('identifies', 0.0020350908092255336)
('identify', 0.0032133008643975705)
('identifying', 0.010868389521669932)
('ierc', 0.0015551924862451385)
('if', 0.0022689787103919684)
('ifds', 0.0018226076189387327)
('ifnα', 0.0026344666833214908)
('ii', 0.01096573558807303)
('iii', 0.011638525702367402)
('il', 0.005766674591285222)
('ild', 0.004886279799015398)
('illness', 0.0025535493102627244)
('illnesses', 0.003686647155460982)
('imaging', 0.0034482818236805067)
('imatinib', 0.004936846431070182)
('imd', 0.000584289046303383)
('immediately', 0.0028371784072713504)
('immune', 0.04823757643124975)
('immunization', 0.005380160323976036)
('immunizations', 0.003414818409909355)
('immuno', 0.013085306252027577)
('immunocompromised', 0.00706506278483488)
('immunoconjugate', 0.0020952931883820433)
('immunogenic', 0.0011177897595958085)
('immunogenicity', 0.0067582622173080285)
('immunologically', 0.0008925326306311553)
('immunology', 0.009366179862353783)
('immu

('london', 0.0015157725180430005)
('long', 0.016280341550929923)
('longer', 0.0037563520540847078)
('longest', 0.001999523124149578)
('look', 0.008346309163969858)
('looking', 0.07274305572068873)
('lopes', 0.001363931761989188)
('lorlatinib', 0.019428569233018726)
('lorlatiniblorlatinib', 0.0015320361574128428)
('loss', 0.007316394841614439)
('lot', 0.001967016883825457)
('louis', 0.014140067773510744)
('loved', 0.0007750784334358824)
('low', 0.012952173699260223)
('lower', 0.015339172358546177)
('lp', 0.0012943487767305853)
('ltd', 0.01695087132357186)
('luciano', 0.002754260541433351)
('luis', 0.00070183995223431)
('lung', 0.03404506222592287)
('lungcancer', 0.0008143799665025663)
('lungs', 0.001587691663415142)
('luo', 0.0007643711110649714)
('lupus', 0.010241695855600978)
('lymphoblastic', 0.024159675091237127)
('lymphocyte', 0.0033443045538335426)
('lymphocytes', 0.0014287287842968967)
('lymphocytic', 0.00423914485201222)
('lymphocytosis', 0.0007969316383601412)
('lymphoma', 0.01

('organized', 0.0034405464061542843)
('organs', 0.002777159521413116)
('original', 0.005968787584034547)
('originally', 0.0016059028155512856)
('originates', 0.003071444115329168)
('originator', 0.0019763764047615005)
('orinvestors', 0.0015721339753217113)
('orphan', 0.006237614605937957)
('orr', 0.015244808818314994)
('os', 0.005292742810997755)
('osteonecrosis', 0.0012057378990245945)
('our', 0.0018334942337001404)
('outcome', 0.008314634326206199)
('outcomes', 0.015322692573942137)
('outlook', 0.0024785284695361463)
('outpatient', 0.0008282553466464432)
('outside', 0.012142378478533918)
('outstanding', 0.0031101612919625164)
('outweighs', 0.0028371784072713504)
('ovarian', 0.0016019947429941168)
('overall', 0.014198792258863903)
('overcome', 0.0022998457750053754)
('overseen', 0.0008318990077993824)
('owned', 0.005370683581738746)
('owner', 0.003917144189725838)
('oxygen', 0.0005880280723727357)
('ozogamicin', 0.01788386496406321)
('pa', 0.003227128368410612)
('pacific', 0.008067047

('realize', 0.008262611973520514)
('realized', 0.006407176770456354)
('rearrangements', 0.0015320361574128428)
('reason', 0.005693963191140041)
('reasons', 0.0023409439187147044)
('receive', 0.012441196599818334)
('received', 0.016915766357038568)
('receives', 0.0021079425536489208)
('receiving', 0.018906632139703717)
('recent', 0.006384938450393641)
('recently', 0.004070084064983138)
('receptor', 0.011861214470041848)
('receptors', 0.00230127793419287)
('recipients', 0.0029870130929608146)
('recist', 0.002488134419696061)
('recognize', 0.0038454305953722324)
('recognized', 0.0018214029905578796)
('recognizes', 0.0038780117316828994)
('recombinant', 0.004858838858967937)
('recommend', 0.002186425326537764)
('recommendation', 0.008104177212927142)
('recommendations', 0.0057398924457279675)
('recommended', 0.01757503741078466)
('recommending', 0.0016544164172352458)
('recommends', 0.0015560796570671448)
('record', 0.010973502620241535)
('records', 0.0046004257961678304)
('recovered', 0.0

('strong', 0.02339791358048308)
('studied', 0.010241445703570011)
('studies', 0.03190632284620727)
('study', 0.059765593344388586)
('sub', 0.0031331223076173143)
('subcutaneous', 0.0029351412410459046)
('subfamilies', 0.0008925326306311553)
('subfamily', 0.0017850652612623106)
('subgroup', 0.003501802781298689)
('subgroups', 0.003283201998354825)
('subject', 0.014164770354578873)
('subjects', 0.001960370331751008)
('submission', 0.014089059797595428)
('submissions', 0.001555260061934848)
('submit', 0.0015427302592288888)
('submitted', 0.0171565907206537)
('submitting', 0.0006888725770573402)
('suboptimal', 0.0006481868862319279)
('subscribe', 0.003917144189725838)
('subsequent', 0.03345533859522832)
('subsequently', 0.001869617432691972)
('subset', 0.0012061797460410741)
('subsidiary', 0.002089377812277992)
('substance', 0.000823188773829302)
('substances', 0.001616750786509615)
('substantial', 0.01831914052325215)
('substrate', 0.000812483446364183)
('substrates', 0.00369511382187955)

('web', 0.006962371648066102)
('webcast', 0.04518423412908506)
('webcontent', 0.0014375683793583691)
('webpage', 0.0020992382786766735)
('website', 0.02200857524540777)
('wednesday', 0.014752225867659117)
('week', 0.015421349932987275)
('weekly', 0.0012299045216161478)
('weeks', 0.015633820844769743)
('weight', 0.00695848567896115)
('weighted', 0.0018584894388980783)
('weinberg', 0.0028863367180308303)
('welcome', 0.0019371404569357219)
('welfare', 0.002123172199125187)
('wellness', 0.020291030019439552)
('whale', 0.0006395759447364295)
('what', 0.001496999529288599)
('wheezing', 0.002770343477485859)
('wheler', 0.0006481868862319279)
('white', 0.005510799505797924)
('who', 0.0023500494332328557)
('wide', 0.01109278270845422)
('widely', 0.0034968511117806716)
('wild', 0.0021460393725082592)
('william', 0.0008591359998462475)
('williams', 0.0017441342522282514)
('winning', 0.0013782623178620447)
('with', 0.004097314118391365)
('withdraw', 0.0005227682074100435)
('withdrew', 0.0010341221

In [26]:
for i, component in enumerate(lsa.components_):
    terms_components = zip(terms, component)
    sorted_terms = sorted(terms_components, key=lambda x:x[1], reverse=True)[:10] # take features for topic
    print("topic : ", i)
    for term_socres in sorted_terms:
        print(10*" ", term_socres)
    print(50*'*')

topic :  0
           ('pfizer', 0.3009950925850856)
           ('patients', 0.269939434894878)
           ('xeljanz', 0.2272627688184445)
           ('cancer', 0.1711791080455353)
           ('bavencio', 0.12971215475923517)
           ('vs', 0.12548518729104635)
           ('treatment', 0.12505752166979314)
           ('clinical', 0.12457642736487425)
           ('eliquis', 0.12445554549333754)
           ('grade', 0.11690269544727168)
**************************************************
topic :  1
           ('xeljanz', 0.8236146873769018)
           ('xr', 0.3670100705248941)
           ('tofacitinib', 0.126101937277937)
           ('ra', 0.05441560360257133)
           ('infections', 0.05172864288745673)
           ('jak', 0.04606527101520123)
           ('methotrexate', 0.04515873940886639)
           ('arthritis', 0.04489023626478907)
           ('octave', 0.044721310955342874)
           ('citrate', 0.04303358245226345)
**************************************************
topic :  

#### Randomized SVD - https://research.fb.com/fast-randomized-svd/