In [1]:
#! /usr/bin/python
# -*- coding: utf-8 -*-

In [2]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
import pyodbc
import textwrap
import pandas as pd
import numpy as np
from scipy import stats

In [3]:
# Total visits - channel*income
# One row per (userid, channel, consumption) group. Result columns, in order:
#   0: userid, 1: channel, 2: income ('high' when consumption > 10000, else 'low'),
#   3: [Number of Visits] = COUNT of non-NULL url values in the group.
# Groups whose channel or consumption is NULL are removed by the HAVING clause.
sql1 = textwrap.dedent("""
    SELECT r.userid
          ,r.channel
          ,IIF(u.consumption>10000,'high','low') AS income
          ,COUNT(r.url) AS [Number of Visits]
    FROM [data].[dbo].[health_records] r
    JOIN [data].[dbo].[user] u
    ON r.userid = u.userid
    GROUP BY r.userid, r.channel, u.consumption
    HAVING r.channel IS NOT NULL AND u.consumption IS NOT NULL
    ORDER BY r.userid
""")
# Total visits - platform*income
# One row per (userid, platform, consumption) group. Result columns, in order:
#   0: userid, 1: platform, 2: income ('high' when consumption > 10000, else 'low'),
#   3: [Number of Visits] = COUNT of non-NULL url values in the group.
# Groups whose platform or consumption is NULL are removed by the HAVING clause.
sql2 = textwrap.dedent("""
    SELECT r.userid
          ,r.platform
          ,IIF(u.consumption>10000,'high','low') AS income
          ,COUNT(r.url) AS [Number of Visits]
    FROM [data].[dbo].[health_records] r
    JOIN [data].[dbo].[user] u
    ON r.userid = u.userid
    GROUP BY r.userid, r.platform, u.consumption
    HAVING r.platform IS NOT NULL AND u.consumption IS NOT NULL
    ORDER BY r.userid
""")
# Total visits - gender*income
# One row per (userid, gender, consumption) group. Result columns, in order:
#   0: userid, 1: gender, 2: income ('high' when consumption > 10000, else 'low'),
#   3: [Number of Visits] = COUNT of non-NULL url values in the group.
# Groups whose gender or consumption is NULL are removed by the HAVING clause.
sql3 = textwrap.dedent("""
    SELECT r.userid
          ,u.gender
          ,IIF(u.consumption>10000,'high','low') AS income
          ,COUNT(r.url) AS [Number of Visits]
    FROM [data].[dbo].[health_records] r
    JOIN [data].[dbo].[user] u
    ON r.userid = u.userid
    GROUP BY r.userid, u.gender, u.consumption
    HAVING u.gender IS NOT NULL AND u.consumption IS NOT NULL
    ORDER BY r.userid
""")
# Total visits - platform*gender
# One row per (userid, platform, gender) group. Result columns, in order:
#   0: userid, 1: platform, 2: gender,
#   3: [Number of Visits] = COUNT of non-NULL url values in the group.
# Groups whose platform or gender is NULL are removed by the HAVING clause.
sql4 = textwrap.dedent("""
    SELECT r.userid
          ,r.platform
          ,u.gender
          ,COUNT(r.url) AS [Number of Visits]
    FROM [data].[dbo].[health_records] r
    JOIN [data].[dbo].[user] u
    ON r.userid = u.userid
    GROUP BY r.userid, r.platform, u.gender
    HAVING r.platform IS NOT NULL AND u.gender IS NOT NULL
    ORDER BY r.userid
""")
# Total visits - channel*gender
# One row per (userid, channel, gender) group. Result columns, in order:
#   0: userid, 1: channel, 2: gender,
#   3: [Number of Visits] = COUNT of non-NULL url values in the group.
# Groups whose channel or gender is NULL are removed by the HAVING clause.
sql5 = textwrap.dedent("""
    SELECT r.userid
          ,r.channel
          ,u.gender
          ,COUNT(r.url) AS [Number of Visits]
    FROM [data].[dbo].[health_records] r
    JOIN [data].[dbo].[user] u
    ON r.userid = u.userid
    GROUP BY r.userid, r.channel, u.gender
    HAVING r.channel IS NOT NULL AND u.gender IS NOT NULL
    ORDER BY r.userid
""")
# Use intensity - all factors
# One row per (userid, date, platform, channel, gender, consumption) group; r.date is
# grouped and ordered on but not selected. Result columns, in order:
#   0: userid, 1: platform, 2: channel,
#   3: [Use Intensity] = COUNT of non-NULL url values in the group,
#   4: gender, 5: income ('high' when consumption > 10000, else 'low').
# Groups with a NULL platform, channel, gender or consumption are removed by the HAVING clause.
sql6 = textwrap.dedent("""
    SELECT r.userid
          ,r.platform
          ,r.channel
          ,COUNT(r.url) AS [Use Intensity]
          ,u.gender
          ,IIF(u.consumption>10000,'high','low') AS income
    FROM [data].[dbo].[health_records] r
    JOIN [data].[dbo].[user] u
    ON r.userid = u.userid
    GROUP BY r.userid, r.date, r.platform, r.channel, u.gender, u.consumption
    HAVING r.platform IS NOT NULL AND r.channel IS NOT NULL AND u.gender IS NOT NULL AND u.consumption IS NOT NULL
    ORDER BY r.userid, r.date
""")

In [4]:
# Open an ODBC connection to the `data` database on localhost using the "SQL Server" driver.
# NOTE(review): no close() call is visible in this notebook — confirm the connection is released elsewhere.
cnxn = pyodbc.connect('DRIVER={SQL Server};SERVER=localhost;DATABASE=data;')

# Shared module-level cursor; load_data() runs every query through it.
cursor = cnxn.cursor()

In [5]:
def load_data(sql):
    """Run ``sql`` on the module-level ``cursor`` and return every row as a DataFrame.

    Each fetched row becomes one DataFrame row. No column names are passed to
    pandas, so the columns carry the default positional labels (0, 1, 2, ...)
    in the order of the query's SELECT list.

    Parameters
    ----------
    sql : str
        Query text handed to ``cursor.execute``.

    Returns
    -------
    pandas.DataFrame
        All rows returned by the query.
    """
    fetched = cursor.execute(sql).fetchall()
    # Convert each row object to a plain list so pandas receives a list of lists.
    records = [list(record) for record in fetched]
    return pd.DataFrame(records)

In [6]:
def power_trans(df, column=3):
    """Add a Box-Cox power-transformed copy of one column to ``df`` as ``'yt'``.

    The frame is modified in place and also returned, so both
    ``power_trans(df)`` and ``df = power_trans(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the values to transform.
    column : hashable, default 3
        Label of the column to transform. The default matches the positional
        label of the count column in frames produced by ``load_data``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the transformed values in column ``'yt'``.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``df``.
    ValueError
        Propagated from ``scipy.stats.boxcox`` when it rejects the data
        (for example, values that are not strictly positive).
    """
    values = df[column]
    if len(values) == 0:
        # boxcox hands back a bare empty array (not a (data, lambda) pair) for
        # empty input, so unpacking its result would fail; short-circuit instead.
        df['yt'] = np.asarray(values, dtype=float)
        return df
    # The fitted lambda is not needed downstream, only the transformed data.
    transformed, _lmbda = stats.boxcox(values)
    df['yt'] = transformed
    return df

In [7]:
def analysis(sql,c1,c2):
    """Run a two-way ANOVA (with interaction) on a query result and print the table.

    The query is loaded with ``load_data`` and passed through ``power_trans``,
    which adds the transformed response column ``'yt'``. The result columns at
    positions 1 and 2 are used as the two categorical factors.

    Parameters
    ----------
    sql : str
        Query to execute; its columns at positions 1 and 2 must hold the
        two factors.
    c1, c2 : str
        Names given to the factor columns at positions 1 and 2. They are
        substituted into the model formula and appear as row labels in the
        printed table, so they must be valid names inside a formula.

    Returns
    -------
    None
        The ANOVA table is printed, not returned.
    """
    df = power_trans(load_data(sql))
    # Label the factor columns with the caller-supplied names so the C(...)
    # terms in the formula are looked up in the data frame itself. Assigning
    # into locals() is not a supported way to create variables and has no
    # effect on the function's namespace on Python 3.13+, where the formula
    # would silently pick up same-named notebook globals instead.
    df = df.rename(columns={1: c1, 2: c2})
    formula = 'yt~C({c1})+C({c2})+C({c1}):C({c2})'.format(c1=c1,c2=c2)
    # typ=1 is passed through to anova_lm unchanged.
    anova_results = sm.stats.anova_lm(ols(formula,df).fit(),typ=1)
    print(anova_results)
    return None

In [8]:
# Use-intensity ANOVA: load the sql6 result, add the Box-Cox transformed
# response 'yt', and fit the full four-factor model with all interactions.
df = power_trans(load_data(sql6))
# Notebook-level names for the factor columns, taken by position in sql6's
# SELECT list (1: platform, 2: channel, 4: gender, 5: income); the formula
# below refers to the factors by these names.
platform, channel, gender, income = df[1], df[2], df[4], df[5]
yt = df['yt']
formula = """
yt~C(platform)+C(channel)+C(gender)+C(income)
  +C(platform):C(channel)+C(platform):C(gender)+C(platform):C(income)+C(channel):C(gender)+C(channel):C(income)+C(gender):C(income)
  +C(platform):C(channel):C(gender)+C(platform):C(channel):C(income)+C(platform):C(income):C(gender)+C(income):C(channel):C(gender)
  +C(platform):C(channel):C(gender):C(income)"""
anova_results = sm.stats.anova_lm(
    ols(formula, df).fit(),
    typ=1,
)
print(anova_results)

                                                df       sum_sq     mean_sq  \
C(platform)                                    1.0   359.643524  359.643524   
C(channel)                                     1.0    52.763519   52.763519   
C(gender)                                      1.0     2.980561    2.980561   
C(income)                                      1.0     9.466801    9.466801   
C(platform):C(channel)                         1.0     2.842930    2.842930   
C(platform):C(gender)                          1.0     1.453874    1.453874   
C(platform):C(income)                          1.0    25.251752   25.251752   
C(channel):C(gender)                           1.0     2.229436    2.229436   
C(channel):C(income)                           1.0     5.510846    5.510846   
C(gender):C(income)                            1.0    65.461621   65.461621   
C(platform):C(channel):C(gender)               1.0     1.847600    1.847600   
C(platform):C(channel):C(income)               1.0  

In [9]:
# Two-way ANOVA on the sql1 result (total visits): factors channel and income.
analysis(sql1,'channel','income')

                         df       sum_sq     mean_sq          F        PR(>F)
C(channel)              1.0   137.995226  137.995226  28.368801  1.882956e-07
C(income)               1.0     1.004807    1.004807   0.206566  6.497774e-01
C(channel):C(income)    1.0     0.262256    0.262256   0.053914  8.165344e-01
Residual              323.0  1571.178788    4.864331        NaN           NaN


In [10]:
# Two-way ANOVA on the sql2 result (total visits): factors platform and income.
analysis(sql2,'platform','income')

                          df       sum_sq    mean_sq         F    PR(>F)
C(platform)              1.0    32.255300  32.255300  5.107447  0.024502
C(income)                1.0     1.138997   1.138997  0.180354  0.671357
C(platform):C(income)    1.0    28.372280  28.372280  4.492592  0.034819
Residual               317.0  2001.965051   6.315347       NaN       NaN


In [11]:
# Two-way ANOVA on the sql3 result (total visits): factors gender and income.
analysis(sql3,'gender','income')

                        df       sum_sq   mean_sq         F    PR(>F)
C(gender)              1.0     0.015724  0.015724  0.002784  0.957951
C(income)              1.0     0.463533  0.463533  0.082082  0.774690
C(gender):C(income)    1.0     2.306880  2.306880  0.408500  0.523210
Residual             306.0  1728.043618  5.647201       NaN       NaN


In [12]:
# Two-way ANOVA on the sql4 result (total visits): factors platform and gender.
analysis(sql4,'platform','gender')

                          df       sum_sq    mean_sq         F    PR(>F)
C(platform)              1.0    28.277411  28.277411  4.529813  0.034050
C(gender)                1.0     1.004503   1.004503  0.160913  0.688577
C(platform):C(gender)    1.0     7.811053   7.811053  1.251268  0.264124
Residual               330.0  2060.028978   6.242512       NaN       NaN


In [13]:
# Two-way ANOVA on the sql5 result (total visits): factors channel and gender.
analysis(sql5,'channel','gender')

                         df       sum_sq     mean_sq          F        PR(>F)
C(channel)              1.0   142.635631  142.635631  30.319421  7.277374e-08
C(gender)               1.0     0.045887    0.045887   0.009754  9.213853e-01
C(channel):C(gender)    1.0     0.269190    0.269190   0.057221  8.110895e-01
Residual              337.0  1585.393314    4.704431        NaN           NaN
