In [None]:
import textwrap
import pyodbc
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt

In [None]:
# SQL text for the extraction pipeline. The statements are meant to be executed
# in order (sql1 .. sql5) on a single connection: sql2 and sql4 create the global
# temp tables ##tmp and ##j that the following statements read from.

# sql1: drop the global temp table ##tmp if it is left over from an earlier run.
sql1 = textwrap.dedent("""
    IF OBJECT_ID('tempdb..##tmp') IS NOT NULL
        DROP TABLE ##tmp
""")
# sql2: for every (userid, date, platform, channel) group in health_records,
# count the non-NULL url values as "intensity" and store the result in ##tmp.
# Groups whose platform or channel is NULL are excluded by the HAVING clause.
sql2 = textwrap.dedent("""
    SELECT userid
          ,date
          ,platform
          ,channel
          ,COUNT(url) AS intensity
    INTO ##tmp
    FROM [data].[dbo].[health_records]
    GROUP BY userid, date, platform, channel
    HAVING platform IS NOT NULL AND channel IS NOT NULL
    ORDER BY userid, date, platform, channel
""")
# sql3: drop the global temp table ##j if it is left over from an earlier run.
sql3 = textwrap.dedent("""
    IF OBJECT_ID('tempdb..##j') IS NOT NULL
        DROP TABLE ##j
""")
# sql4: collapse the date dimension -- average the per-date intensity for every
# (userid, platform, channel) and store the result in ##j.
# NOTE(review): intensity comes from COUNT(), i.e. an integer; SQL Server's AVG
# over an integer column returns an integer, so the average is presumably
# truncated. Confirm whether that is intended; if not, cast to FLOAT before AVG.
sql4 = textwrap.dedent("""
    SELECT userid
          ,platform
          ,channel
          ,AVG(intensity) AS intensity
    INTO ##j
    FROM ##tmp
    GROUP BY userid, platform, channel
    ORDER BY userid, platform, channel
""")
# sql5: final result set -- join ##j to the user table on userid and return
# platform, channel, gender, consumption (aliased as "income") and intensity.
# This is an inner JOIN, so rows of ##j without a matching user are not returned.
sql5 = textwrap.dedent("""
    SELECT j.platform
          ,j.channel
          ,u.gender
          ,u.consumption AS income
          ,j.intensity
    FROM ##j j
    JOIN [data].[dbo].[user] u
    ON u.userid = j.userid
""")

In [None]:
# Histogram of a data distribution
def dist_plot(s, bins=15):
    """Show a histogram of ``s`` in a figure of its own.

    Parameters
    ----------
    s : pandas.Series
        Values to plot. Only ``s.plot.hist`` and, when present, ``s.name``
        are used.
    bins : int, optional
        Number of histogram bins. Defaults to 15, the value that used to be
        hard-coded.

    Returns
    -------
    None
        The figure is displayed via ``plt.show()``; nothing is returned.
    """
    # Draw on an explicit Axes instead of relying on pyplot's implicit
    # "current figure" state.
    _, ax = plt.subplots()
    s.plot.hist(bins=bins, ax=ax)
    # Label the x-axis with the series name so that consecutive plots
    # (e.g. a column before and after a transform) can be told apart.
    label = getattr(s, 'name', None)
    if label is not None:
        ax.set_xlabel(str(label))
    plt.show()
    return None

In [None]:
# Apply a Box-Cox power transform to one column
def power_trans(df,col):
    """Box-Cox transform ``df[col]`` and store the result in ``df[col + '_t']``.

    The function plots the distribution of the column before and after the
    transform and prints the result of ``scipy.stats.normaltest`` on the
    transformed values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column. It is modified in place: a new column
        named ``col + '_t'`` is added (or overwritten).
    col : str
        Name of the column to transform. ``scipy.stats.boxcox`` rejects
        non-positive data, so the values must be strictly positive.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    target = col + '_t'

    # Distribution before the transform.
    dist_plot(df[col])

    # boxcox returns (transformed values, fitted lambda); the lambda is not kept.
    transformed, _fitted_lambda = stats.boxcox(df[col])
    df[target] = transformed

    # Distribution after the transform, followed by the normality test result.
    dist_plot(df[target])
    normality = stats.normaltest(df[target])
    print(normality)
    return df

In [None]:
# Open the ODBC connection to the local SQL Server database "data" and create
# the cursor that the query cell uses. The connection string carries no
# user name or password.
conn_str = 'DRIVER={SQL Server};SERVER=localhost;DATABASE=data;'
cnxn = pyodbc.connect(conn_str)
cursor = cnxn.cursor()

In [None]:
# Column names of the sql5 result set, in SELECT order.
result_columns = ['platform', 'channel', 'gender', 'income', 'intensity']

# Build the temp tables in dependency order (drop/create ##tmp, drop/create ##j),
# then fetch the joined result.
for statement in (sql1, sql2, sql3, sql4):
    cursor.execute(statement)
rows = cursor.execute(sql5).fetchall()

# Name the columns at construction time: with an empty result set the frame
# still has the expected columns, so the filters below do not raise KeyError.
df = pd.DataFrame([tuple(row) for row in rows], columns=result_columns)

# Keep rows with intensity above 2 and income below 50000. The filters are
# applied one after the other, exactly as before.
df = df[df['intensity'] > 2]
# .copy() detaches the filtered rows from the unfiltered frame so that later
# cells can assign columns without SettingWithCopyWarning.
df = df[df['income'] < 50000].copy()
df.describe()

In [None]:
# Binary-encode the categorical columns. For each column the listed label
# becomes 0 and every other value becomes 1:
#   gender:   0 = '女' (female), 1 = male
#   channel:  0 = browser,      1 = app
#   platform: 0 = iphone,       1 = android
# Note: running this cell a second time maps every value to 1, because the
# already-encoded 0/1 values no longer equal any of the labels.
zero_labels = (
    ('gender', '女'),
    ('channel', 'browser'),
    ('platform', 'iphone'),
)
for column, zero_label in zero_labels:
    df[column] = df[column].apply(
        lambda value, zero=zero_label: 0 if value == zero else 1
    )

In [None]:
# Box-Cox transform both numeric measures; each call adds a '<name>_t' column,
# shows the before/after histograms and prints the normality test result.
for measure in ('income', 'intensity'):
    df = power_trans(df, measure)

In [None]:
# Renumber the rows 0..n-1 and derive a 1-based row number from the new index.
df.reset_index(drop=True, inplace=True)
df['id'] = df.index + 1
# Move the id column to the front: pop() removes it from the frame and hands
# it straight to insert(), which places it at position 0.
df.insert(0, 'id', df.pop('id'))

In [None]:
# Show the 95th, 50th and 5th percentiles of the transformed measures in one
# table. A notebook cell only displays its last expression, so three separate
# quantile() calls would show the 5th percentile only.
df[['income_t', 'intensity_t']].quantile([0.95, 0.5, 0.05])

In [None]:
# Export the id, the encoded categoricals and the Box-Cox transformed measures.
# Keys are the frame's column names, values are the header names written to
# the file (the '_t' suffix is dropped).
export_map = {
    'id': 'id',
    'platform': 'platform',
    'channel': 'channel',
    'gender': 'gender',
    'income_t': 'income',
    'intensity_t': 'intensity',
}
df.to_csv(
    'intensity.csv',
    columns=list(export_map.keys()),
    header=list(export_map.values()),
    index=False,
)