sentiment score for each post.

export file columns:
- compound score (maybe also add the pos, neg, and neu scores)
- gender
- role
- age

07-12-2018

---

In [91]:
import pandas as pd

In [92]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [93]:
from sqlalchemy import create_engine
# Connection to the local PostgreSQL database that the notebook reads its tables from.
db_url = 'postgresql://zhiyzuo@localhost:5432/ic2s2_datathon'
engine = create_engine(db_url)

---

#### extract information

In [94]:
# Load the whole `text` table into a DataFrame and preview the first rows.
text_df = pd.read_sql_table('text', engine)
text_df.head(2)

Unnamed: 0,qa_id,text,asker_id,answerer_id,type,post_user_id,disease
0,0,"Hello, I'm a male in his 20s and I've had fati...",TigerShot,"Faye Lang, RN, MSW",q,TigerShot,psychiatric topics
1,1,Hello.. You might remember me contacting you 3...,marcia smith,"Faye Lang, RN, MSW",q,marcia smith,psychiatric topics


In [95]:
text_df.tail(2)

Unnamed: 0,qa_id,text,asker_id,answerer_id,type,post_user_id,disease
22001,79,"Dear CED, I havent heard anything to indicate ...",CED,Dr. Tamer Fouad,a,Dr. Tamer Fouad,leukemia
22002,80,"Dear Janet, Leukemia is an aggressive disease ...",JANET B,Dr. Tamer Fouad,a,Dr. Tamer Fouad,leukemia


Set up an `index` column so that every row has a unique id.

In [96]:
# Give every row a positional id (0 .. n-1) to use as a unique merge key later.
# The built-in range replaces `pd.np.arange`: the `pd.np` alias was deprecated
# in pandas 1.0 and removed in 2.0, so the old call fails on current pandas.
text_df['index'] = range(len(text_df))
text_df.head(2)

Unnamed: 0,qa_id,text,asker_id,answerer_id,type,post_user_id,disease,index
0,0,"Hello, I'm a male in his 20s and I've had fati...",TigerShot,"Faye Lang, RN, MSW",q,TigerShot,psychiatric topics,0
1,1,Hello.. You might remember me contacting you 3...,marcia smith,"Faye Lang, RN, MSW",q,marcia smith,psychiatric topics,1


subset

In [97]:
# Keep only the columns used in the rest of the notebook.
kept_columns = ['index', 'text', 'type', 'disease', 'post_user_id']
df = text_df.loc[:, kept_columns]
df.head(2)

Unnamed: 0,index,text,type,disease,post_user_id
0,0,"Hello, I'm a male in his 20s and I've had fati...",q,psychiatric topics,TigerShot
1,1,Hello.. You might remember me contacting you 3...,q,psychiatric topics,marcia smith


In [98]:
df.shape

(22003, 5)

---

#### calculate sentiment score

In [99]:
from joblib import Parallel, delayed

In [100]:
analyzer = SentimentIntensityAnalyzer()

In [101]:
# Number of joblib workers used to score the texts in parallel.
n_jobs = 28
# Wrap the analyzer call once, then submit one task per text in df['text'].
score_text = delayed(analyzer.polarity_scores)
worker_pool = Parallel(n_jobs=n_jobs)
score_list = worker_pool(score_text(post_text) for post_text in df['text'].values)

In [102]:
score_list[0]

{'neg': 0.123, 'neu': 0.776, 'pos': 0.101, 'compound': -0.6515}

In [103]:
# Each element of score_list is a dict of scores (neg / neu / pos / compound);
# stack them into a frame with one row per scored text and preview it.
score_df = pd.DataFrame(score_list)
score_df.head(2)

Unnamed: 0,compound,neg,neu,pos
0,-0.6515,0.123,0.776,0.101
1,-0.3861,0.143,0.719,0.138


In [104]:
score_df.shape

(22003, 4)

##### merge

Add the same positional `index` column to the scores so they can be merged back onto the posts.

In [105]:
# Positional id (0 .. n-1) matching the `index` column on the posts, used as the
# merge key. The built-in range replaces `pd.np.arange`: the `pd.np` alias was
# deprecated in pandas 1.0 and removed in 2.0.
score_df['index'] = range(len(score_df))

In [106]:
# Attach the sentiment scores to their posts via the shared `index` key.
df = pd.merge(df, score_df, how='inner', on='index')
df.head(2)

Unnamed: 0,index,text,type,disease,post_user_id,compound,neg,neu,pos
0,0,"Hello, I'm a male in his 20s and I've had fati...",q,psychiatric topics,TigerShot,-0.6515,0.123,0.776,0.101
1,1,Hello.. You might remember me contacting you 3...,q,psychiatric topics,marcia smith,-0.3861,0.143,0.719,0.138


---

#### merge with user

get user info

In [107]:
# Load the whole `user` table into a DataFrame and preview the first rows.
user_table = pd.read_sql_table(table_name='user', con=engine)
user_table.head(2)

Unnamed: 0,age,gender,role,user_id,id
0,74.0,Female,Nurse,"Faye Lang, RN, MSW",0
1,,Male,Psychotherapist,Tim W Latsko,66


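Remove rows where both `gender` and `user_id` are missing (`how='all'` keeps rows that have at least one of them).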

In [108]:
# Drop only the rows in which both `gender` and `user_id` are missing.
user_table = user_table.dropna(subset=['gender', 'user_id'], how='all')

In [109]:
df.head(2)

Unnamed: 0,index,text,type,disease,post_user_id,compound,neg,neu,pos
0,0,"Hello, I'm a male in his 20s and I've had fati...",q,psychiatric topics,TigerShot,-0.6515,0.123,0.776,0.101
1,1,Hello.. You might remember me contacting you 3...,q,psychiatric topics,marcia smith,-0.3861,0.143,0.719,0.138


In [110]:
# Align the key name with user_table so the two frames can be joined on `user_id`.
df = df.rename(columns={'post_user_id': 'user_id'})

In [111]:
# Keep only the posts whose author appears in user_table.
author_is_known = df['user_id'].isin(user_table['user_id'])
df = df[author_is_known]

In [112]:
# Join the user attributes onto the posts, leaving out user_table's last column.
user_info = user_table.iloc[:, :-1]
df = df.merge(user_info, on='user_id')

In [113]:
df.head(2)

Unnamed: 0,index,text,type,disease,user_id,compound,neg,neu,pos,age,gender,role
0,0,"Hello, I'm a male in his 20s and I've had fati...",q,psychiatric topics,TigerShot,-0.6515,0.123,0.776,0.101,32.0,Male,patient
1,2207,I'm a 23 year old male and my problems started...,q,neurology topics,TigerShot,-0.976,0.145,0.793,0.062,32.0,Male,patient


In [114]:
df.gender.unique()

array(['Male', 'Female'], dtype=object)

Strip whitespace from `gender` and encode it numerically: Female = 1, Male = 0 (`None` is mapped to -1).

In [115]:
# Trim surrounding whitespace, then translate the labels to numeric codes.
gender_codes = {'Female': 1, 'Male': 0, None: -1}
df['gender'] = df['gender'].str.strip().map(gender_codes)

In [116]:
df.head(2)

Unnamed: 0,index,text,type,disease,user_id,compound,neg,neu,pos,age,gender,role
0,0,"Hello, I'm a male in his 20s and I've had fati...",q,psychiatric topics,TigerShot,-0.6515,0.123,0.776,0.101,32.0,0,patient
1,2207,I'm a 23 year old male and my problems started...,q,neurology topics,TigerShot,-0.976,0.145,0.793,0.062,32.0,0,patient


Convert `role` to a binary indicator: 1 for patients, 0 for every other role.

In [117]:
df['role'].unique()

array(['patient', 'Medical Doctor', 'Nurse', 'Paramedic',
       'Physician Assistant', 'Psychotherapist', 'Medical Assistant',
       'Nurse Assistant', 'Pharmacist', 'Medical Student'], dtype=object)

In [119]:
# 1 when the poster's role is 'patient', 0 for any other role.
df['role_binary'] = df['role'].eq('patient').astype(int)

In [120]:
df.head(2)

Unnamed: 0,index,text,type,disease,user_id,compound,neg,neu,pos,age,gender,role,role_binary
0,0,"Hello, I'm a male in his 20s and I've had fati...",q,psychiatric topics,TigerShot,-0.6515,0.123,0.776,0.101,32.0,0,patient,1
1,2207,I'm a 23 year old male and my problems started...,q,neurology topics,TigerShot,-0.976,0.145,0.793,0.062,32.0,0,patient,1


In [122]:
# Save the full table (post text included) as CSV, without the DataFrame's row index.
csv_path = 'data/sentiment.csv'
df.to_csv(csv_path, index=False)

In [126]:
# Export a Stata copy without the `text` and `user_id` columns.
# write_index=False mirrors the CSV export (index=False): the frame already
# carries its own `index` id column, and by default to_stata would also write
# the DataFrame's row index as an extra, redundant variable.
df.drop(columns=['text', 'user_id']).to_stata('data/sentiment.dta', write_index=False)