In [None]:
# -*- coding: utf-8 -*-
import pymysql.cursors
import pandas as pd
import os
import matplotlib.pyplot as plt
import textwrap
from scipy import stats
import math

In [None]:
# Settings for the MySQL connection shared by the cells below; rows come back
# as dicts because of the DictCursor cursor class.
# NOTE(review): the password falls back to a literal stored in this notebook
# when the `mysql_password` environment variable is unset -- consider rotating
# that credential and dropping the fallback.
db_settings = dict(
    host='localhost',
    user='root',
    password=os.environ.get('mysql_password', '960728'),
    db='hdf',
    charset='utf8mb4',
    cursorclass=pymysql.cursors.DictCursor,
)
connection = pymysql.connect(**db_settings)

In [None]:
def query(sql):
    """Run ``sql`` on the module-level ``connection`` and return the rows as a DataFrame.

    Parameters
    ----------
    sql : str
        Statement passed unchanged to ``cursor.execute``.

    Returns
    -------
    pandas.DataFrame
        One row per fetched record. Rows are expected to be mappings: the keys
        of the first row become the columns. When the statement yields no
        rows, an empty frame is returned whose columns are taken from
        ``cursor.description``.

    Notes
    -----
    The connection is closed once the call finishes, as before. It is
    re-opened at the start of every call, so the function (and the notebook
    cell calling it) can be run more than once.
    """
    try:
        # A previous call leaves the connection closed; reconnect if needed.
        connection.ping(reconnect=True)
        with connection.cursor() as cursor:
            cursor.execute(sql)
            rows = cursor.fetchall()
            if rows:
                return pd.DataFrame(rows, columns=list(rows[0].keys()))
            # No rows: there is no first record to read keys from, so fall
            # back to the cursor metadata (first item of each entry is the name).
            column_names = [entry[0] for entry in (cursor.description or ())]
            return pd.DataFrame(columns=column_names)
    finally:
        # Only close a live connection so a failed reconnect is not masked by
        # a secondary "already closed" error.
        if connection.open:
            connection.close()

In [None]:
# Columns read from the `ultra_ultimate` table, in SELECT order.
selected_columns = (
    'ME',
    'MC',
    'CS',
    'MAP',
    'OP',
    'F',
    'POP',
    'SOP',
    'DIS',
    'usefulre',
    'disease_cat',
    'comment_score',
    'number_of_comments',
    'doctorProfession',
    'hospital_grade',
)

# One column per line, indented four spaces, between the SELECT and FROM lines.
sql = "\nSELECT\n{}\nFROM ultra_ultimate;\n".format(
    ",\n".join("    " + column for column in selected_columns)
)

In [None]:
# Fetch the selected columns, then cast the nine listed columns to float.
float_columns = ['ME', 'MC', 'CS', 'MAP', 'OP', 'F', 'POP', 'SOP', 'DIS']
df = query(sql).astype(dict.fromkeys(float_columns, 'float'))

In [None]:
def _log_base10(value):
    """Return the base-10 logarithm of ``value`` via ``math.log(value, 10)``."""
    return math.log(value, 10)


# Replace three columns with their base-10 logarithms.
# NOTE(review): this overwrites the columns in place, so running the cell twice
# applies the logarithm twice.
# `usefulre` and `comment_score` are shifted by 0.01 before the logarithm.
for shifted_column in ('usefulre', 'comment_score'):
    df[shifted_column] = df[shifted_column].apply(lambda raw: _log_base10(raw + 0.01))

# `number_of_comments` gets no shift, so a zero or negative entry raises
# ValueError -- NOTE(review): confirm the column is always positive.
df['number_of_comments'] = df['number_of_comments'].apply(_log_base10)

In [None]:
def segment(df, condition, cname, output_dir=None):
    """Split ``df`` by disease category and a boolean condition, one CSV per group.

    Rows are grouped by ``df['disease_cat']`` together with ``condition``.
    Each group is written to ``<disease_cat>_<cname>_<high|low>.csv``, where
    the last part is ``high`` when the group's condition value is truthy and
    ``low`` otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``disease_cat`` and the ten exported columns listed in
        the body.
    condition : pandas.Series
        Grouping key aligned with ``df``; its truthiness picks ``high``/``low``.
    cname : str
        Label inserted in the middle of every file name.
    output_dir : str, optional
        Directory the files are written to. Defaults to the current working
        directory, which is where the files went before this parameter existed.

    Returns
    -------
    None
        The function is called for its side effect of writing CSV files.
    """
    export_columns = ['ME', 'MC', 'CS', 'MAP', 'OP', 'F', 'POP', 'SOP', 'DIS', 'usefulre']
    for (category, is_high), group in df.groupby([df['disease_cat'], condition]):
        level = 'high' if is_high else 'low'
        # format() instead of `+` so non-string category values do not raise.
        filename = '{}_{}_{}.csv'.format(category, cname, level)
        target = os.path.join(output_dir, filename) if output_dir else filename
        # Renumber rows from 0 on a fresh frame rather than mutating the frame
        # yielded by groupby; the new index is still written as the first column.
        group.reset_index(drop=True).to_csv(target, columns=export_columns)

In [None]:
# Export one high/low split per criterion; each mask is built right before use.

# Title: '副主任医师' (associate chief physician) or '主任医师' (chief physician).
title_mask = df['doctorProfession'].isin(['副主任医师', '主任医师'])
segment(df, title_mask, 'title')

# Hospital grade: '三甲' (grade 3A) or '三级' (grade 3).
grade_mask = df['hospital_grade'].isin(['三甲', '三级'])
segment(df, grade_mask, 'grade')

# Number of comments strictly above the column median.
noc_mask = df['number_of_comments'] > df.number_of_comments.median()
segment(df, noc_mask, 'noc')

# Comment score strictly above the column median.
cs_mask = df['comment_score'] > df.comment_score.median()
segment(df, cs_mask, 'cs')