# Text Mining with NLTK and Scikit-Learn

By: Axay Patel

In [35]:
import numpy as np
import pandas as pd

import nltk
import re

from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.utils.extmath import randomized_svd



## Preprocessing

In [5]:
# load tweets from bbchealth
col = ['id', 'date','tweet']
df = pd.read_csv('../data/Health-Tweets/bbchealth.txt', sep="|", names = col)

In [6]:
df.head()

Unnamed: 0,id,date,tweet
0,585978391360221184,Thu Apr 09 01:31:50 +0000 2015,Breast cancer risk test devised http://bbc.in/...
1,585947808772960257,Wed Apr 08 23:30:18 +0000 2015,GP workload harming care - BMA poll http://bbc...
2,585947807816650752,Wed Apr 08 23:30:18 +0000 2015,Short people's 'heart risk greater' http://bbc...
3,585866060991078401,Wed Apr 08 18:05:28 +0000 2015,New approach against HIV 'promising' http://bb...
4,585794106170839041,Wed Apr 08 13:19:33 +0000 2015,Coalition 'undermined NHS' - doctors http://bb...


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3929 entries, 0 to 3928
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      3929 non-null   int64 
 1   date    3929 non-null   object
 2   tweet   3929 non-null   object
dtypes: int64(1), object(2)
memory usage: 92.2+ KB


In [8]:
# no null values
df.isnull().sum()

id       0
date     0
tweet    0
dtype: int64

In [9]:
# no duplicate values
df.duplicated().sum()

0

In [10]:
df.count()

id       3929
date     3929
tweet    3929
dtype: int64

There are 3929 tweets in the Corpus.

In [11]:
# remove urls from tweets 
df['tweet'] = df['tweet'].str.replace('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', regex=True)

In [12]:
df['tweet'].head(20)

0                      Breast cancer risk test devised 
1                  GP workload harming care - BMA poll 
2                  Short people's 'heart risk greater' 
3                 New approach against HIV 'promising' 
4                 Coalition 'undermined NHS' - doctors 
5                   Review of case against NHS manager 
6     VIDEO: 'All day is empty, what am I going to d...
7        VIDEO: 'Overhaul needed' for end-of-life care 
8                      Care for dying 'needs overhaul' 
9             VIDEO: NHS: Labour and Tory key policies 
10                         Have GP services got worse? 
11                A&amp;E waiting hits new worst level 
12                   Parties row over GP opening hours 
13      Why strenuous runs may not be so bad after all 
14         VIDEO: Health surcharge for non-EU patients 
15        VIDEO: Skin cancer spike 'from 60s holidays' 
16               80,000 'might die' in future outbreak 
17                Skin cancer 'linked to holiday

In [13]:
#load stop words (http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words)
stop_words_file = open("../data/stop_words.txt")
stop_words = stop_words_file.read().splitlines()
stop_words_file.close()

print(stop_words)

['a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around', 'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both', 'bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant', 'co', 'computer', 'con', 'could', 'couldnt', 'cry', 'de', 'describe', 'detail', 'do', 'done', 'down', 'due', 'during', 'each', 'eg', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', 'every', 'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen', 'fify', 'fill', 'find', 'fire', 'first', 'five', 'for', 'former', 'formerly', 'forty', 'found', 'four', 'from', 'front', 'full', 'further', '

## Tokenizing and Stemming

In [14]:
ps = PorterStemmer()

corpus_stemmed = []
for tweet in df['tweet']:
    words = pd.Series(nltk.word_tokenize(tweet))
    stemmed_words = words.apply(ps.stem)
    corpus_stemmed.append(' '.join(list(stemmed_words)))
    
print('We processed {} documents.'.format(len(corpus_stemmed)))

We processed 3929 documents.


In [15]:
corpus_stemmed

['breast cancer risk test devis',
 'GP workload harm care - bma poll',
 "short peopl 's 'heart risk greater '",
 "new approach against hiv 'promis '",
 "coalit 'undermin nh ' - doctor",
 'review of case against nh manag',
 "video : 'all day is empti , what am I go to do ? '",
 "video : 'overhaul need ' for end-of-lif care",
 "care for die 'need overhaul '",
 'video : nh : labour and tori key polici',
 'have GP servic got wors ?',
 'A & amp ; E wait hit new worst level',
 'parti row over GP open hour',
 'whi strenuou run may not be so bad after all',
 'video : health surcharg for non-eu patient',
 "video : skin cancer spike 'from 60 holiday '",
 "80,000 'might die ' in futur outbreak",
 "skin cancer 'link to holiday boom '",
 "public 'back tax rise to fund nh '",
 'video : welcom to the design asylum',
 'video : whi are we have less sex ?',
 'five idea to transform the nh',
 "person cancer vaccin 'excit '",
 "child heart surgeri death 'halv '",
 "video : miliband : cameron 'fail ' the n

## TF-IDF

In [18]:
cv = CountVectorizer(stop_words=stop_words, token_pattern=r"(?u)\b\w[\w'’]*\w\b")

In [27]:
cv_fit = cv.fit_transform(corpus_stemmed)

word_list = cv.get_feature_names();    
count_list = cv_fit.toarray().sum(axis=0)    

freq_counts = dict(zip(word_list,count_list))
print( freq_counts )


{'00': 1, '000': 23, '10': 11, '100': 4, '10m': 1, '11': 2, '111': 3, '113': 1, '12': 4, '13': 2, '1300bc': 1, '14': 1, '15': 2, '150': 1, '16': 2, '162': 1, '16bn': 1, '17': 1, '18': 3, '19': 5, '1920': 1, '1970': 1, '1982': 1, '1990': 1, '1bn': 5, '1m': 2, '20': 9, '200': 2, '2000': 1, '2001': 1, '2003': 1, '2007': 1, '200k': 1, '2013': 2, '2014': 6, '2015': 5, '2020': 1, '2025': 1, '2030': 2, '208': 1, '21': 1, '22': 1, '227k': 1, '232': 1, '25': 3, '250m': 2, '28': 2, '29': 1, '2bn': 4, '2m': 3, '30': 3, '300': 1, '300m': 1, '30m': 3, '32bn': 1, '330': 2, '36': 1, '361': 1, '37': 1, '39': 1, '3d': 6, '3m': 2, '40': 7, '400': 2, '400m': 1, '47bn': 1, '48': 1, '480': 1, '4m': 1, '50': 13, '500': 3, '500m': 3, '55': 2, '570m': 1, '5bn': 2, '5m': 4, '60': 3, '63': 1, '65m': 1, '6bn': 2, '6m': 2, '70': 2, '700': 2, '700m': 1, '73': 1, '74': 1, '75': 2, '750': 1, '75m': 1, '78': 1, '7c': 1, '7m': 2, '80': 5, '80m': 1, '81': 1, '82m': 1, '88': 1, '8m': 1, '90': 1, '900': 2, '900m': 1, '91

In [32]:
dict(sorted(freq_counts.items(), key=lambda item: item[1], reverse = True))

{'video': 814,
 'ebola': 356,
 'nh': 349,
 'cancer': 217,
 'health': 194,
 'care': 189,
 'hospit': 175,
 'audio': 160,
 'patient': 140,
 'death': 125,
 'drug': 125,
 'new': 114,
 'uk': 106,
 'risk': 102,
 'help': 100,
 'amp': 95,
 'doctor': 95,
 'mental': 93,
 'test': 87,
 'babi': 85,
 'warn': 85,
 'gp': 81,
 'children': 68,
 'obes': 68,
 'case': 67,
 'face': 67,
 'heart': 66,
 'child': 65,
 'cut': 64,
 'need': 62,
 'brain': 60,
 'link': 59,
 'wait': 59,
 'vaccin': 58,
 'surgeri': 55,
 'rise': 54,
 'life': 53,
 'nurs': 53,
 'plan': 52,
 'dementia': 51,
 'ban': 48,
 'home': 48,
 'staff': 48,
 'die': 47,
 'live': 47,
 'save': 47,
 'fail': 46,
 'diseas': 45,
 'cigarett': 41,
 'drink': 40,
 'whi': 40,
 'diabet': 39,
 'hiv': 39,
 'target': 39,
 'time': 39,
 'blood': 38,
 'food': 38,
 'fund': 38,
 'trust': 38,
 'chang': 37,
 'fight': 36,
 'peopl': 36,
 'crisi': 35,
 'man': 35,
 'smoke': 35,
 'women': 35,
 'medic': 34,
 'year': 34,
 'breast': 33,
 'flu': 33,
 'rate': 33,
 'use': 33,
 'day': 3

### Top 20 most Frequent words 
| Word | Count |
| ----------- | ----------- |
| video | 814 |
| ebola | 356|
| nh | 349 |
|cancer | 217 |
|health| 194 |
| care | 189 |
| hospit | 175 |
| audio | 160 |
| patient | 140 |
| death | 125 |
| drug | 125 |
| new | 114 |
| uk | 106 |
| risk | 102 |
| help | 100 |
| amp | 95 |
| doctor | 95 |
| mental | 93 |
| test | 87 |
| babi | 85 |

In [41]:
pipe = Pipeline([('count', CountVectorizer(vocabulary=word_list)), ('tfid', TfidfTransformer())]).fit(corpus_stemmed)
pipe['count'].transform(corpus_stemmed).toarray()


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [42]:
pipe['tfid'].idf_

array([8.58324752, 6.09834087, 6.79148806, ..., 8.58324752, 8.58324752,
       8.17778242])

In [44]:
pipe.transform(corpus_stemmed).shape

(3929, 3212)

## Concept mapping using SVD

In [54]:
n_concepts = 10 # how many concepts would you like?
concepts = [(lambda x: 'concept{:d}'.format(x))(i+1) for i in range(n_concepts)]
U, Sigma, VT = randomized_svd(pipe.transform(corpus_stemmed), n_components=n_concepts)

In [70]:
# Document-to-concept matrix, as a Pandas DataFrame
U_df = pd.DataFrame(U, columns=concepts)

U_df['content'] = corpus_stemmed
U_df

Unnamed: 0,concept1,concept2,concept3,concept4,concept5,concept6,concept7,concept8,concept9,concept10,content
0,0.018068,0.006825,-0.023163,-0.031953,-0.019791,0.072302,0.015840,0.004682,-0.003437,-0.002672,breast cancer risk test devis
1,0.009606,0.011372,-0.004686,-0.013489,0.018730,-0.013925,0.034752,0.019899,-0.006388,-0.000261,GP workload harm care - bma poll
2,0.007982,0.003432,-0.002220,-0.009002,-0.001728,0.006027,-0.000245,0.005627,-0.015804,0.026049,short peopl 's 'heart risk greater '
3,0.006943,0.000206,-0.004582,0.003831,-0.002132,0.006393,0.001323,0.000700,-0.001931,0.014599,new approach against hiv 'promis '
4,0.013861,0.012494,-0.025481,0.035904,0.003420,-0.002224,-0.003733,0.001076,-0.003937,-0.003089,coalit 'undermin nh ' - doctor
...,...,...,...,...,...,...,...,...,...,...,...
3924,0.002117,0.000910,-0.001991,-0.004428,0.002816,-0.001324,-0.006357,-0.010488,-0.017487,0.019573,babi born after ovari 'reawaken '
3925,0.000267,0.000062,-0.000118,-0.000543,-0.000278,-0.000358,-0.000533,-0.000935,-0.001800,0.001710,ident triplet born against odd
3926,0.014500,0.016511,-0.017359,-0.030541,0.048947,-0.011852,-0.022243,-0.036288,0.028962,-0.022333,hospit fail to make improv
3927,0.031025,0.027237,-0.047411,0.043180,0.018419,0.005620,-0.034970,0.051704,-0.037723,-0.022733,new patient target pledg for nh


In [87]:
# Concept Strength
Sigma_df = pd.DataFrame(np.diag(Sigma), index=concepts, columns=concepts )

Sigma_df

Unnamed: 0,concept1,concept2,concept3,concept4,concept5,concept6,concept7,concept8,concept9,concept10
concept1,7.006142,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
concept2,0.0,5.665431,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
concept3,0.0,0.0,5.38447,0.0,0.0,0.0,0.0,0.0,0.0,0.0
concept4,0.0,0.0,0.0,5.068367,0.0,0.0,0.0,0.0,0.0,0.0
concept5,0.0,0.0,0.0,0.0,4.985926,0.0,0.0,0.0,0.0,0.0
concept6,0.0,0.0,0.0,0.0,0.0,4.917384,0.0,0.0,0.0,0.0
concept7,0.0,0.0,0.0,0.0,0.0,0.0,4.566786,0.0,0.0,0.0
concept8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.473145,0.0,0.0
concept9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.199721,0.0
concept10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.112359


In [78]:
VT_df = pd.DataFrame( VT, index=concepts, columns=word_list ).T
VT_df

Unnamed: 0,concept1,concept2,concept3,concept4,concept5,concept6,concept7,concept8,concept9,concept10
00,0.000170,0.000089,0.000050,-0.000169,-0.000206,-0.000014,-0.000158,-0.000326,-0.000129,0.000490
000,0.019433,-0.015101,-0.010329,-0.016161,0.013171,0.025201,-0.006914,0.003789,0.002702,0.008093
10,0.007125,-0.005275,-0.004962,-0.003452,0.002819,0.013941,0.008943,-0.000552,0.002235,-0.000865
100,0.004710,-0.008070,-0.000730,0.000297,0.004462,0.008042,0.001345,-0.001160,-0.003249,0.013525
10m,0.001321,-0.003485,0.000583,0.000735,0.001633,0.001313,0.000139,0.000145,-0.000118,-0.000136
...,...,...,...,...,...,...,...,...,...,...
yuk,0.000017,0.000004,0.000013,-0.000022,-0.000032,0.000034,0.000005,-0.000011,0.000006,-0.000045
zap,0.000065,0.000025,0.000047,0.000036,-0.000035,-0.000040,-0.000177,0.000254,0.000436,0.000236
zeneca,0.000361,-0.000039,-0.000548,-0.000291,-0.000652,0.001906,0.000730,-0.000096,-0.000211,0.000891
zero,0.000234,0.000205,-0.000234,-0.000215,0.000212,0.000694,0.000310,0.000284,0.000173,0.004149


In [88]:
VT_df.nlargest(10, 'concept1')

Unnamed: 0,concept1,concept2,concept3,concept4,concept5,concept6,concept7,concept8,concept9,concept10
video,0.603465,0.007437,0.066559,-0.203422,-0.528552,-0.425179,-0.037193,-0.122475,0.022809,-0.065304
ebola,0.369777,-0.71152,0.108742,0.169875,0.262313,0.106313,0.042548,0.045524,0.045177,-0.05303
nh,0.268459,0.266082,-0.446646,0.701479,0.02789,-0.005522,0.035417,-0.12414,-0.046478,-0.073762
health,0.212346,0.312924,0.580204,0.155498,0.112712,0.200439,-0.044443,0.005809,0.045073,-0.056068
cancer,0.168599,0.095031,-0.256073,-0.304946,-0.187105,0.625972,0.124158,0.008383,0.122128,-0.264251
care,0.163979,0.185718,-0.056795,-0.211237,0.386839,-0.23208,0.676161,0.165583,0.028788,0.034487
mental,0.147389,0.245863,0.445548,0.103582,0.090219,0.141799,-0.02707,0.009456,0.031072,-0.06906
patient,0.146812,0.064841,-0.116083,-0.136533,0.199511,-0.029284,-0.301691,0.445468,-0.546277,-0.317646
hospit,0.132859,0.114976,-0.166119,-0.303534,0.46442,-0.07556,-0.318368,-0.454222,0.310835,-0.211726
uk,0.127452,-0.184152,0.03465,0.008924,0.065425,0.091686,-0.006518,-0.005821,-0.04282,0.02513


In [89]:
VT_df.nlargest(10, 'concept2')

Unnamed: 0,concept1,concept2,concept3,concept4,concept5,concept6,concept7,concept8,concept9,concept10
health,0.212346,0.312924,0.580204,0.155498,0.112712,0.200439,-0.044443,0.005809,0.045073,-0.056068
nh,0.268459,0.266082,-0.446646,0.701479,0.02789,-0.005522,0.035417,-0.12414,-0.046478,-0.073762
mental,0.147389,0.245863,0.445548,0.103582,0.090219,0.141799,-0.02707,0.009456,0.031072,-0.06906
care,0.163979,0.185718,-0.056795,-0.211237,0.386839,-0.23208,0.676161,0.165583,0.028788,0.034487
hospit,0.132859,0.114976,-0.166119,-0.303534,0.46442,-0.07556,-0.318368,-0.454222,0.310835,-0.211726
cancer,0.168599,0.095031,-0.256073,-0.304946,-0.187105,0.625972,0.124158,0.008383,0.122128,-0.264251
cut,0.072662,0.085214,0.051375,-0.012695,0.015534,0.118648,-0.0303,0.028849,-0.094001,-0.019611
child,0.064211,0.083079,0.112881,0.007787,0.020362,0.043445,-0.026454,-0.022597,-0.047083,0.110805
patient,0.146812,0.064841,-0.116083,-0.136533,0.199511,-0.029284,-0.301691,0.445468,-0.546277,-0.317646
fail,0.053671,0.056764,-0.055716,-0.045087,0.107847,-0.053371,0.028036,-0.008111,-0.077236,-0.058716


In [90]:
VT_df.nlargest(10, 'concept3')

Unnamed: 0,concept1,concept2,concept3,concept4,concept5,concept6,concept7,concept8,concept9,concept10
health,0.212346,0.312924,0.580204,0.155498,0.112712,0.200439,-0.044443,0.005809,0.045073,-0.056068
mental,0.147389,0.245863,0.445548,0.103582,0.090219,0.141799,-0.02707,0.009456,0.031072,-0.06906
child,0.064211,0.083079,0.112881,0.007787,0.020362,0.043445,-0.026454,-0.022597,-0.047083,0.110805
ebola,0.369777,-0.71152,0.108742,0.169875,0.262313,0.106313,0.042548,0.045524,0.045177,-0.05303
servic,0.033133,0.051975,0.070673,0.013619,0.023218,0.036627,-0.004124,0.016154,0.025493,-0.025509
video,0.603465,0.007437,0.066559,-0.203422,-0.528552,-0.425179,-0.037193,-0.122475,0.022809,-0.065304
cut,0.072662,0.085214,0.051375,-0.012695,0.015534,0.118648,-0.0303,0.028849,-0.094001,-0.019611
uk,0.127452,-0.184152,0.03465,0.008924,0.065425,0.091686,-0.006518,-0.005821,-0.04282,0.02513
prioriti,0.013174,0.019745,0.034071,0.009727,0.000353,0.007962,-0.005851,0.000631,0.003454,-0.004786
crisi,0.045376,-0.006586,0.032371,0.021838,0.028562,0.007444,-0.008496,0.008896,0.05796,-0.00715


In [91]:
VT_df.nlargest(10, 'concept4')

Unnamed: 0,concept1,concept2,concept3,concept4,concept5,concept6,concept7,concept8,concept9,concept10
nh,0.268459,0.266082,-0.446646,0.701479,0.02789,-0.005522,0.035417,-0.12414,-0.046478,-0.073762
ebola,0.369777,-0.71152,0.108742,0.169875,0.262313,0.106313,0.042548,0.045524,0.045177,-0.05303
health,0.212346,0.312924,0.580204,0.155498,0.112712,0.200439,-0.044443,0.005809,0.045073,-0.056068
staff,0.06133,0.02305,-0.055158,0.116465,0.064138,-0.01632,0.01041,-0.022378,0.028818,-0.043609
mental,0.147389,0.245863,0.445548,0.103582,0.090219,0.141799,-0.02707,0.009456,0.031072,-0.06906
strike,0.025673,0.021698,-0.037896,0.082471,0.011764,-0.001993,-0.001112,-0.024151,0.009015,-0.017203
new,0.092878,0.029438,-0.059195,0.062918,-0.022518,0.069827,-0.017204,-0.001918,-0.018226,0.096945
spend,0.025044,0.031169,0.009638,0.047237,0.000739,0.003655,-8.5e-05,-0.007309,0.004933,-0.006398
pay,0.020381,0.023693,-0.020535,0.039162,0.011115,-0.006352,0.028273,-0.005547,0.003361,-0.003333
nurs,0.059455,-0.062859,0.000748,0.03915,0.046398,0.012014,0.009261,0.009572,0.010496,0.003233


In [92]:
VT_df.nlargest(10, 'concept5')

Unnamed: 0,concept1,concept2,concept3,concept4,concept5,concept6,concept7,concept8,concept9,concept10
hospit,0.132859,0.114976,-0.166119,-0.303534,0.46442,-0.07556,-0.318368,-0.454222,0.310835,-0.211726
care,0.163979,0.185718,-0.056795,-0.211237,0.386839,-0.23208,0.676161,0.165583,0.028788,0.034487
ebola,0.369777,-0.71152,0.108742,0.169875,0.262313,0.106313,0.042548,0.045524,0.045177,-0.05303
patient,0.146812,0.064841,-0.116083,-0.136533,0.199511,-0.029284,-0.301691,0.445468,-0.546277,-0.317646
death,0.082312,0.032977,-0.057697,-0.113123,0.187875,0.047526,-0.21993,-0.241598,-0.299126,0.380182
health,0.212346,0.312924,0.580204,0.155498,0.112712,0.200439,-0.044443,0.005809,0.045073,-0.056068
fail,0.053671,0.056764,-0.055716,-0.045087,0.107847,-0.053371,0.028036,-0.008111,-0.077236,-0.058716
home,0.041901,0.039609,-0.025706,-0.069495,0.103727,-0.079527,0.191292,0.0332,0.006071,0.024284
mental,0.147389,0.245863,0.445548,0.103582,0.090219,0.141799,-0.02707,0.009456,0.031072,-0.06906
uk,0.127452,-0.184152,0.03465,0.008924,0.065425,0.091686,-0.006518,-0.005821,-0.04282,0.02513


In [93]:
VT_df.nlargest(10, 'concept6')

Unnamed: 0,concept1,concept2,concept3,concept4,concept5,concept6,concept7,concept8,concept9,concept10
cancer,0.168599,0.095031,-0.256073,-0.304946,-0.187105,0.625972,0.124158,0.008383,0.122128,-0.264251
drug,0.102311,-0.00609,-0.105822,-0.047529,-0.103454,0.29768,0.106226,-0.006349,-0.014794,0.073426
health,0.212346,0.312924,0.580204,0.155498,0.112712,0.200439,-0.044443,0.005809,0.045073,-0.056068
breast,0.041696,0.023781,-0.060438,-0.069562,-0.073054,0.16661,0.045977,-0.005305,0.023617,-0.052759
test,0.075016,-0.034691,-0.047346,-0.026228,-0.01983,0.141968,0.051066,0.006551,-0.0397,0.117843
mental,0.147389,0.245863,0.445548,0.103582,0.090219,0.141799,-0.02707,0.009456,0.031072,-0.06906
cut,0.072662,0.085214,0.051375,-0.012695,0.015534,0.118648,-0.0303,0.028849,-0.094001,-0.019611
audio,0.056858,0.046439,-0.041398,-0.032562,0.050617,0.114427,0.045311,0.051462,0.041627,0.432547
ebola,0.369777,-0.71152,0.108742,0.169875,0.262313,0.106313,0.042548,0.045524,0.045177,-0.05303
risk,0.078103,0.032357,-0.002785,-0.074085,-0.001981,0.092349,-0.020196,0.046282,-0.136301,0.127891


In [94]:
VT_df.nlargest(10, 'concept7')

Unnamed: 0,concept1,concept2,concept3,concept4,concept5,concept6,concept7,concept8,concept9,concept10
care,0.163979,0.185718,-0.056795,-0.211237,0.386839,-0.23208,0.676161,0.165583,0.028788,0.034487
home,0.041901,0.039609,-0.025706,-0.069495,0.103727,-0.079527,0.191292,0.0332,0.006071,0.024284
cancer,0.168599,0.095031,-0.256073,-0.304946,-0.187105,0.625972,0.124158,0.008383,0.122128,-0.264251
drug,0.102311,-0.00609,-0.105822,-0.047529,-0.103454,0.29768,0.106226,-0.006349,-0.014794,0.073426
elderli,0.03375,0.031255,-0.027819,-0.040399,0.055088,-0.040195,0.063645,0.050082,-0.017065,-0.024355
concern,0.023751,0.023515,0.00663,-0.024041,0.026886,0.003699,0.062842,0.005705,-0.001646,0.03565
need,0.051409,0.042843,0.018247,0.015155,0.048607,-0.004633,0.05697,0.020178,0.003841,0.043888
life,0.040924,0.003977,-0.016244,-0.029005,-0.01206,0.0022,0.056554,0.016128,-0.003482,0.061838
poor,0.017862,0.018124,-0.010205,-0.025534,0.035413,-0.010456,0.0529,-0.003035,-0.000148,0.028197
test,0.075016,-0.034691,-0.047346,-0.026228,-0.01983,0.141968,0.051066,0.006551,-0.0397,0.117843


In [95]:
VT_df.nlargest(10, 'concept8')

Unnamed: 0,concept1,concept2,concept3,concept4,concept5,concept6,concept7,concept8,concept9,concept10
patient,0.146812,0.064841,-0.116083,-0.136533,0.199511,-0.029284,-0.301691,0.445468,-0.546277,-0.317646
amp,0.06466,0.056139,-0.088243,-0.019589,0.055703,-0.062072,-0.245744,0.362593,0.406045,0.19773
wait,0.051715,0.048373,-0.070335,0.006354,0.013452,-0.022834,-0.193962,0.325777,0.227716,0.10855
target,0.044008,0.03767,-0.069311,0.016471,0.009801,0.007113,-0.098908,0.182238,0.166299,0.046422
time,0.038099,0.026212,-0.041614,0.009667,-0.002443,-0.009955,-0.09018,0.17132,0.139741,0.065754
care,0.163979,0.185718,-0.056795,-0.211237,0.386839,-0.23208,0.676161,0.165583,0.028788,0.034487
miss,0.034113,0.025277,-0.036335,0.00052,0.009429,0.008501,-0.058178,0.15218,0.127449,0.035049
gp,0.064375,0.045475,-0.024029,-0.061753,-0.016346,-0.026584,-0.032614,0.142896,-0.111288,-0.047872
hour,0.0227,0.01133,-0.019749,-0.000855,0.009336,-0.017713,-0.033834,0.09954,0.062059,0.03571
doctor,0.070968,-0.002883,-0.04465,-0.025194,0.022816,-0.021698,-0.074158,0.098647,-0.018346,0.019314


In [96]:
VT_df.nlargest(10, 'concept9')

Unnamed: 0,concept1,concept2,concept3,concept4,concept5,concept6,concept7,concept8,concept9,concept10
amp,0.06466,0.056139,-0.088243,-0.019589,0.055703,-0.062072,-0.245744,0.362593,0.406045,0.19773
hospit,0.132859,0.114976,-0.166119,-0.303534,0.46442,-0.07556,-0.318368,-0.454222,0.310835,-0.211726
wait,0.051715,0.048373,-0.070335,0.006354,0.013452,-0.022834,-0.193962,0.325777,0.227716,0.10855
target,0.044008,0.03767,-0.069311,0.016471,0.009801,0.007113,-0.098908,0.182238,0.166299,0.046422
time,0.038099,0.026212,-0.041614,0.009667,-0.002443,-0.009955,-0.09018,0.17132,0.139741,0.065754
miss,0.034113,0.025277,-0.036335,0.00052,0.009429,0.008501,-0.058178,0.15218,0.127449,0.035049
cancer,0.168599,0.095031,-0.256073,-0.304946,-0.187105,0.625972,0.124158,0.008383,0.122128,-0.264251
improv,0.028789,0.030512,-0.010763,-0.020211,0.034361,-0.012936,0.012002,0.033944,0.071667,0.031477
england,0.029412,0.029192,-0.049176,0.032466,-0.002685,0.006312,-0.027725,0.071018,0.06796,0.025478
vaccin,0.076079,-0.13587,0.020989,0.015927,0.00519,0.015841,0.02949,-0.003262,0.066081,0.027509


In [97]:
VT_df.nlargest(10, 'concept10')

Unnamed: 0,concept1,concept2,concept3,concept4,concept5,concept6,concept7,concept8,concept9,concept10
audio,0.056858,0.046439,-0.041398,-0.032562,0.050617,0.114427,0.045311,0.051462,0.041627,0.432547
death,0.082312,0.032977,-0.057697,-0.113123,0.187875,0.047526,-0.21993,-0.241598,-0.299126,0.380182
babi,0.040727,0.015119,-0.030742,-0.05899,0.043141,-0.014129,-0.07485,-0.120903,-0.177817,0.221711
amp,0.06466,0.056139,-0.088243,-0.019589,0.055703,-0.062072,-0.245744,0.362593,0.406045,0.19773
heart,0.040514,0.017337,-0.011673,-0.037228,-0.002549,-0.00752,-0.007609,0.019774,-0.077993,0.153747
warn,0.07906,0.041612,0.029167,-0.033206,0.012005,0.073801,-0.024566,-0.017648,0.017636,0.132899
risk,0.078103,0.032357,-0.002785,-0.074085,-0.001981,0.092349,-0.020196,0.046282,-0.136301,0.127891
test,0.075016,-0.034691,-0.047346,-0.026228,-0.01983,0.141968,0.051066,0.006551,-0.0397,0.117843
rise,0.045099,0.026118,0.010406,-0.000765,0.021982,0.030412,-0.041992,-0.001034,0.008915,0.111378
child,0.064211,0.083079,0.112881,0.007787,0.020362,0.043445,-0.026454,-0.022597,-0.047083,0.110805


### Top 10 terms for each concept
| Concept | Terms |
| ----------- | ----------- |
| Concept1 |video, ebola, nh, health, cancer, care, mental, patient, hospit, uk|
| Concept2 |health, nh, mental, care, hospit, cancer, cut, child, patient, fail|
| Concept3 |health, menttal, child, ebola, servic, video, cut , uk, prioriti, crisi |
| Concept4 |nh, ebola, health, staff, mental, strike, new, spen, pay, nurs |
| Concept5 |hospit, care, ebola, patient, death, health, fail, home, mental, uk |
| Concept6 |cancer, drug, health, breast, test, mental, cut, audio, ebola, risk|
| Concept7 |care, home, cancer, drug, elderli, concern, need, life, poor |
| Concept8 |patient, amp, wait, target, time, care, miss, gp, hour, doctor|
| Concept9 |amp, hospit, wait, target, time, miss, cancer, improv, england, vaccin |
| Concept10 | audio, death, babi, amp , heart, warn, risk, test, rise, child|

### Top 10 Tweets for each Concept 

In [105]:
# Concept 1
U_df.nlargest(10, 'concept1')['content']

1445               video : nh staff to help in ebola area
486              video : stigma of care for ebola patient
2887               video : video goggl for hospit patient
966          video : nh staff set off to help fight ebola
2898             video : child mental health care concern
2984                  video : health warn on nh spend cut
2383          video : mental health patient 'hit by cut '
278     video : the children let down by mental health...
1366                video : first US ebola case 'critic '
1352               video : ebola : how can you catch it ?
Name: content, dtype: object

In [106]:
# Concept 2
U_df.nlargest(10, 'concept2')['content']

279                     nh child mental health care pledg
3110                   clegg attack nh mental health care
2898             video : child mental health care concern
2535                    mental health cut cost nh million
2590         audio : mental health care 'need transform '
2388                     cut 'hit mental health patient '
2760                       mental health cut 'risk live '
2383          video : mental health patient 'hit by cut '
1337        video : mental health care 'must be instant '
278     video : the children let down by mental health...
Name: content, dtype: object

In [107]:
# Concept 3
U_df.nlargest(10, 'concept3')['content']

3793                   mental health servic 'in crisi '
266     video : child mental health 'must be prioriti '
271                prioriti call on child mental health
2898           video : child mental health care concern
963                         mental health nurs cut warn
50                  'rise in child mental health issu '
121        video : child mental health 'need overhaul '
100                  mental health budget 'cut by 8 % '
134                  mental health 'to get fund boost '
133                   video : mental health spend boost
Name: content, dtype: object

In [108]:
# Concept 4
U_df.nlargest(10, 'concept4')['content']

1291               nh strike : what next ?
400        nh 111 'increas pressur on nh '
743         nh staff to strike in new year
1170    how nh staff will help treat ebola
1171    how nh staff will help treat ebola
967     nh staff to fli out to fight ebola
1139        new strike by nh staff announc
471     unison call off strike by nh staff
2159                  drop the ' N ' in nh
2535     mental health cut cost nh million
Name: content, dtype: object

In [109]:
# Concept 5
U_df.nlargest(10, 'concept5')['content']

2385                   hospit 'fail ' two patient
2323                    hospit care fail 'shock '
2427              hospit warn on oap patient care
3723           poor care risk 'at 1 in 4 hospit '
2120            hospit patient to get name doctor
938     A & amp ; E care 'still unsaf ' at hospit
1903                             hospit at home ?
348                    hospit matern care concern
2465     elderli care : 'who will care for me ? '
1348                who warn of ebola hospit risk
Name: content, dtype: object

In [110]:
# Concept 6
U_df.nlargest(10, 'concept6')['content']

3330              drug 'halv ' breast cancer risk
1296            nh rule on new breast cancer drug
1720              nh say no to breast cancer drug
2484             breast cancer drug price cut urg
3150            'quicker ' drug for breast cancer
2605            breast cancer 'person drug ' hope
198           cancer drug patient 's england move
2338    audio : trial cancer drug 'save my life '
408                 blood cancer drug offer on nh
3893               walk 'cut breast cancer risk '
Name: content, dtype: object

In [111]:
# Concept 7
U_df.nlargest(10, 'concept7')['content']

2465    elderli care : 'who will care for me ? '
3649      video : home care system under pressur
3100       video : move to improv care home life
1960            cancer care 'could be privatis '
358       one in five care home 'fail key test '
2864                care home criticis in report
833               'take care complaint serious '
3094             fail elderli care home to close
2866              eleven care home staff suspend
1879                plan to tackl fail care home
Name: content, dtype: object

In [112]:
# Concept 8
U_df.nlargest(10, 'concept8')['content']

237      rise in A & amp ; E wait time for patient
234                  patient wait time target miss
2236                 patient miss wait time target
199              A & amp ; E wait time target miss
2354           A & amp ; E wait time target improv
292          fewer A & amp ; E patient wait longer
2262    most patient 'right to go to A & amp ; E '
782          A & amp ; E wait time target question
2032                patient 'face longer GP wait '
3868    A & amp ; E pressur 'put patient at risk '
Name: content, dtype: object

In [113]:
# Concept 9
U_df.nlargest(10, 'concept9')['content']

199                     A & amp ; E wait time target miss
2354                  A & amp ; E wait time target improv
782                 A & amp ; E wait time target question
2258       audio : To A & amp ; E or not to A & amp ; E ?
137     video : hospit 'struggl ' with A & amp ; E target
505               A & amp ; E wait time in england improv
2618           mani A & amp ; Es 'have miss wait target '
320                A & amp ; E wait improv across england
1823            A & amp ; E unit miss 12-hour wait target
143               A & amp ; E target miss whole of winter
Name: content, dtype: object

In [114]:
# Concept 10
U_df.nlargest(10, 'concept10')['content']

2258    audio : To A & amp ; E or not to A & amp ; E ?
23                   child heart surgeri death 'halv '
2584                      'no apolog ' over babi death
1035                      surgeon warn over death rate
2776       audio : 'whi I would take alzheim 's test '
3603           audio : should worker face drink test ?
201                      babi death inquiri report due
2226          audio : sugari drink 'should have warn '
2944                  rise in death from 'legal high '
2290         audio : childlin drink and drug call rise
Name: content, dtype: object

### Names for each concept
| Concept | Name | Reasoning
| ----------- | ----------- |----------- |
| Concept1 |Current issues | mentions ebola and mental health |
| Concept2 |Mental Health | Mental halth mentioned frequently |
| Concept3 |Child mental health  | Child Mental halth menbtioned frequently |
| Concept4 |Nh Staff | NH Staff menbtioned frequently |
| Concept5 |Failures | Mentions how hospital fails in tweets |
| Concept6 |Breast Cancer | Tweets talk about breast cancer |
| Concept7 |Assisted Living | Mentions the elderly, moving and care homes |
| Concept8 |Patient updates | Tweets seem to be monitoring patient |
| Concept9 |Wait time | Tweets mention waiting for target |
| Concept10 | Dangers | Tweets mention death rates |