<a href="https://colab.research.google.com/github/wyjustin/helloWorld/blob/master/ML_homework_1_well.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 最原始的python 版本

In [None]:
import time
import pandas as pd
import numpy as np

def target_mean_v1(data, y_name, x_name):
    """Leave-one-out target mean, naive baseline.

    For every row, compute the mean of ``y_name`` over all *other* rows that
    share the row's ``x_name`` value. The group-by is deliberately recomputed
    once per row; this is the slow reference the faster versions are timed
    against.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both columns. Any index is accepted; rows are
        addressed by position.
    y_name : str
        Name of the target column.
    x_name : str
        Name of the grouping (category) column.

    Returns
    -------
    numpy.ndarray
        One float per row. NaN when no other row shares the row's category.
    """
    n_rows = data.shape[0]
    result = np.zeros(n_rows)
    positions = np.arange(n_rows)
    for i in range(n_rows):
        # Drop row i by position so a non-default index cannot select the wrong rows.
        others = data.iloc[positions != i]
        # Aggregate only the target column; the group key becomes the index.
        group_means = others.groupby(x_name)[y_name].mean()
        category = data[x_name].iloc[i]
        if category in group_means.index:
            result[i] = group_means.loc[category]
        else:
            # Row i was the only member of its category: nothing left to average.
            result[i] = np.nan
    return result

def target_mean_v2(data, y_name, x_name):
    """Leave-one-out target mean using two passes and running totals.

    The first pass accumulates, per ``x_name`` value, the sum of ``y_name``
    and the number of rows. The second pass removes each row's own
    contribution and divides by the remaining row count.

    Rows are looked up with ``data.loc[row, ...]`` for ``row`` in
    ``0 .. n-1``, so the frame is expected to carry the default integer index.

    Returns a float array with one entry per row.
    """
    n_rows = data.shape[0]
    sums = {}
    counts = {}

    # Pass 1: per-category totals.
    for row in range(n_rows):
        key = data.loc[row, x_name]
        target = data.loc[row, y_name]
        if key in sums:
            sums[key] += target
            counts[key] += 1
        else:
            sums[key] = target
            counts[key] = 1

    # Pass 2: subtract the row's own target before averaging.
    result = np.zeros(n_rows)
    for row in range(n_rows):
        key = data.loc[row, x_name]
        remaining_sum = sums[key] - data.loc[row, y_name]
        remaining_count = counts[key] - 1
        result[row] = remaining_sum / remaining_count
    return result


In [None]:
# Random benchmark frame: binary target `y` and a 10-level category `x`, 5000 rows.
y = np.random.randint(2, size=(5000, 1))
x = np.random.randint(10, size=(5000, 1))
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

# Time the naive per-row groupby baseline.
start_time = time.time()
result = target_mean_v1(data, 'y', 'x')
end_time = time.time()

print("Python total time:", (end_time - start_time))
print("Python result:")
print(result)
print("\n")

# Time the two-pass dictionary version on the same frame.
start_time = time.time()
result = target_mean_v2(data, 'y', 'x')
end_time = time.time()

print("Python V2 total time:", (end_time - start_time))
# This banner previously said "V1" although the array printed is the V2 result.
print("Python V2 result:")
print(result)

V1 total time: 24.06008768081665
V1 result:
[0.50867052 0.51282051 0.54761905 ... 0.51896208 0.5204918  0.50094877]


V2 total time: 0.3091433048248291
V1 result:
[0.50867052 0.51282051 0.54761905 ... 0.51896208 0.5204918  0.50094877]


# 课堂上cython的版本

In [None]:
%reload_ext cython

In [None]:
%%cython

import numpy as np
cimport numpy as cnp

# Leave-one-out target mean, classroom Cython version (reference for timing).
# For each row, returns the mean of column `y_name` over the other rows that
# share the same `x_name` value, as a float64 array with one entry per row.
#   data   : column-indexable table with a `.shape`; the notebook passes a pandas DataFrame
#   y_name : name of the target column
#   x_name : name of the grouping column
# Both columns are converted to float64 before use.
# NOTE(review): if a category occurs only once the divisor below is 0; the
# operands are Python objects at that point, so this presumably raises
# ZeroDivisionError — confirm.
cpdef target_mean_v3(data, y_name, x_name):
    cdef long nrow = data.shape[0]
    cdef cnp.ndarray[double] result = np.asfortranarray(np.zeros(nrow), dtype=np.float64)
    cdef cnp.ndarray[double] x = np.asfortranarray(data[x_name], dtype=np.float64)
    cdef cnp.ndarray[double] y = np.asfortranarray(data[y_name], dtype=np.float64)
    
    # Per-category running sum of y and row count, stored in Python dicts.
    cdef dict value_dic = dict()
    cdef dict count_dic = dict()

    # Pass 1: accumulate the totals for every category.
    for i in range(nrow):
      if x[i] not in value_dic.keys():
        value_dic[x[i]] = y[i]
        count_dic[x[i]] = 1
      else:
        value_dic[x[i]] += y[i]
        count_dic[x[i]] += 1
    # Pass 2: remove the row's own y from its category total, then average the rest.
    for i in range(nrow):
      result[i] = (value_dic[x[i]] - y[i]) / (count_dic[x[i]] - 1)
    
    return result

In [None]:
# Time the classroom Cython implementation on the current `data` frame.
# `result` is left in the notebook namespace after this cell runs.
start_time = time.time()
result = target_mean_v3(data, 'y', 'x')
end_time = time.time()

# Elapsed wall-clock seconds, followed by the leave-one-out means.
print("total time:", (end_time - start_time))
print(result)

total time: 0.006345510482788086
[0.50867052 0.51282051 0.54761905 ... 0.51896208 0.5204918  0.50094877]


# 个人优化的几个版本

1. 继续使用dict，做如下优化的版本（v1）
  - 数据类型根据题目要求可以指定为int
  - range函数替换

2. 替换dict为C++的unordered_map的版本
  - 数据类型根据题目要求可以指定为int
  - range函数替换和不替换（v2和v3两种），两个差不多，v3版本一般情况下比v2快

3. 尝试使用pymp的并行，不作为作业提交的版本
  - 发现在简单任务，而且免费colab的2个cpu的情况下，并行效果不佳，反而比v2和v3还要慢

所以快的原因总结下：
1. int类型计算时比double的快，与计算机的存储方式和计算方式有关
2. 更加精简的直接的命令比封装的命令要快，相当于C++里面的inline的效果


In [None]:
%reload_ext cython

In [None]:
!pip install pymp-pypi



In [None]:
! cat /proc/cpuinfo

processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 79
model name	: Intel(R) Xeon(R) CPU @ 2.20GHz
stepping	: 0
microcode	: 0x1
cpu MHz		: 2200.190
cache size	: 56320 KB
physical id	: 0
siblings	: 2
core id		: 0
cpu cores	: 1
apicid		: 0
initial apicid	: 0
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm rdseed adx smap xsaveopt arat md_clear arch_capabilities
bugs		: cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs taa
bogomips	: 4400.38
clflush size	: 64
cache_alignment	: 64
address sizes	: 46 bits physical, 48 b

In [None]:
%%cython --cplus

import time
import pandas as pd
import numpy as np
cimport numpy as cnp
from libcpp.unordered_map cimport unordered_map
from cython cimport parallel
import pymp

# Leave-one-out target mean, classroom version, used as the reference that the
# optimised variants in this cell are timed and diffed against.
# For each row, returns the mean of column `y_name` over the other rows that
# share the same `x_name` value, as a float64 array with one entry per row.
#   data   : column-indexable table with a `.shape`; the notebook passes a pandas DataFrame
#   y_name : name of the target column
#   x_name : name of the grouping column
# Both columns are converted to float64 before use.
# NOTE(review): if a category occurs only once the divisor below is 0; the
# operands are Python objects at that point, so this presumably raises
# ZeroDivisionError — confirm.
cpdef target_mean_v_teacher(data, y_name, x_name):
    cdef long nrow = data.shape[0]
    cdef cnp.ndarray[double] result = np.asfortranarray(np.zeros(nrow), dtype=np.float64)
    cdef cnp.ndarray[double] x = np.asfortranarray(data[x_name], dtype=np.float64)
    cdef cnp.ndarray[double] y = np.asfortranarray(data[y_name], dtype=np.float64)
    
    # Per-category running sum of y and row count, stored in Python dicts.
    cdef dict value_dic = dict()
    cdef dict count_dic = dict()

    # Pass 1: accumulate the totals for every category.
    for i in range(nrow):
      if x[i] not in value_dic.keys():
        value_dic[x[i]] = y[i]
        count_dic[x[i]] = 1
      else:
        value_dic[x[i]] += y[i]
        count_dic[x[i]] += 1
    # Pass 2: remove the row's own y from its category total, then average the rest.
    for i in range(nrow):
      result[i] = (value_dic[x[i]] - y[i]) / (count_dic[x[i]] - 1)
    
    return result

cpdef target_mean_v_new1(data, y_name, x_name):
    """Leave-one-out target mean: dict-based variant with int32 inputs.

    For each row, returns the mean of ``y_name`` over the other rows sharing
    the same ``x_name`` value.

    Parameters
    ----------
    data : column-indexable table with ``.shape`` (the notebook passes a
        pandas DataFrame). Both columns are converted to int32.
    y_name : name of the target column.
    x_name : name of the grouping column.

    Returns
    -------
    float64 ndarray with one entry per row; NaN where the row is the only
    member of its category (no other rows to average).
    """
    cdef int nrow = data.shape[0]
    # Declare the index explicitly so buffer indexing is done on a C integer.
    cdef int i
    cdef double missing = np.nan
    cdef cnp.ndarray[double] result = np.zeros(nrow, dtype=np.float64)
    cdef cnp.ndarray[int] x = np.asfortranarray(data[x_name], dtype=np.int32)
    cdef cnp.ndarray[int] y = np.asfortranarray(data[y_name], dtype=np.int32)

    cdef dict value_dic = dict()
    cdef dict count_dic = dict()

    # Pass 1: per-category sum of y and row count.
    # The legacy `for ... from` loop form is kept on purpose: it is what this
    # variant is meant to compare against the `range` form.
    for i from 0 <= i < nrow by 1:
        # Test membership on the dict itself instead of building a keys view per row.
        if x[i] not in value_dic:
            value_dic[x[i]] = y[i]
            count_dic[x[i]] = 1
        else:
            value_dic[x[i]] += y[i]
            count_dic[x[i]] += 1

    # Pass 2: remove the row's own y, then average over the remaining rows.
    for i from 0 <= i < nrow by 1:
        if count_dic[x[i]] > 1:
            result[i] = (value_dic[x[i]] - y[i]) / (count_dic[x[i]] - 1)
        else:
            # Singleton category: avoid dividing by zero.
            result[i] = missing

    return result

cpdef target_mean_v_cplus(data, y_name, x_name):
    """Leave-one-out target mean: C++ ``unordered_map`` variant, ``range`` loops.

    For each row, returns the mean of ``y_name`` over the other rows sharing
    the same ``x_name`` value.

    Parameters
    ----------
    data : column-indexable table with ``.shape`` (the notebook passes a
        pandas DataFrame). Both columns are converted to int32.
    y_name : name of the target column.
    x_name : name of the grouping column.

    Returns
    -------
    float64 ndarray with one entry per row; NaN where the row is the only
    member of its category (no other rows to average).
    """
    cdef int nrow = data.shape[0]
    # Declare the index explicitly so the loops and buffer indexing use a C integer.
    cdef int i
    cdef int others
    cdef double missing = np.nan
    cdef cnp.ndarray[double] result = np.zeros(nrow, dtype=np.float64)
    cdef cnp.ndarray[int] x = np.asfortranarray(data[x_name], dtype=np.int32)
    cdef cnp.ndarray[int] y = np.asfortranarray(data[y_name], dtype=np.int32)

    cdef unordered_map[int, int] value_map
    cdef unordered_map[int, int] count_map

    # Pass 1: per-category sum of y and row count.
    # unordered_map::operator[] value-initialises a missing int entry to 0, so
    # no separate count()/insert branch is needed.
    for i in range(nrow):
        value_map[x[i]] += y[i]
        count_map[x[i]] += 1

    # Pass 2: remove the row's own y, then average over the remaining rows.
    for i in range(nrow):
        others = count_map[x[i]] - 1
        if others > 0:
            # Cast so the division is floating-point whatever language_level is in effect.
            result[i] = <double>(value_map[x[i]] - y[i]) / others
        else:
            # Singleton category: avoid dividing by zero.
            result[i] = missing

    return result

cpdef target_mean_v_cplus_last(data, y_name, x_name):
    """Leave-one-out target mean: C++ ``unordered_map`` variant, ``for ... from`` loops.

    Same computation as the ``range``-loop ``unordered_map`` variant; only the
    loop syntax differs, which is what this variant exists to compare.

    Parameters
    ----------
    data : column-indexable table with ``.shape`` (the notebook passes a
        pandas DataFrame). Both columns are converted to int32.
    y_name : name of the target column.
    x_name : name of the grouping column.

    Returns
    -------
    float64 ndarray with one entry per row; NaN where the row is the only
    member of its category (no other rows to average).
    """
    cdef int nrow = data.shape[0]
    # Declare the index explicitly so the loops and buffer indexing use a C integer.
    cdef int i
    cdef int others
    cdef double missing = np.nan
    cdef cnp.ndarray[double] result = np.zeros(nrow, dtype=np.float64)
    cdef cnp.ndarray[int] x = np.asfortranarray(data[x_name], dtype=np.int32)
    cdef cnp.ndarray[int] y = np.asfortranarray(data[y_name], dtype=np.int32)

    cdef unordered_map[int, int] value_map
    cdef unordered_map[int, int] count_map

    # Pass 1: per-category sum of y and row count.
    # unordered_map::operator[] value-initialises a missing int entry to 0, so
    # no separate count()/insert branch is needed.
    for i from 0 <= i < nrow by 1:
        value_map[x[i]] += y[i]
        count_map[x[i]] += 1

    # Pass 2: remove the row's own y, then average over the remaining rows.
    for i from 0 <= i < nrow by 1:
        others = count_map[x[i]] - 1
        if others > 0:
            # Cast so the division is floating-point whatever language_level is in effect.
            result[i] = <double>(value_map[x[i]] - y[i]) / others
        else:
            # Singleton category: avoid dividing by zero.
            result[i] = missing

    return result

# Experimental leave-one-out target mean: the totals are accumulated serially
# in C++ unordered_maps, then the per-row division runs inside a
# pymp.Parallel(2) region that writes into a pymp.shared.array.
#   data   : column-indexable table with a `.shape`; the notebook passes a pandas DataFrame
#   y_name : name of the target column (converted to int32)
#   x_name : name of the grouping column (converted to int32)
# Returns the shared result array, one float64 entry per row.
# NOTE(review): a category with a single row makes the divisor 0 — behaviour
# inside the parallel region in that case is not established here; confirm.
cpdef target_mean_v_cplus_test_pymp(data, y_name, x_name):
    cdef int nrow = data.shape[0]
    # Result buffer comes from pymp so the parallel workers' writes reach the caller — presumably shared memory; confirm.
    cdef cnp.ndarray[double] result = pymp.shared.array((nrow,), dtype=np.float64)
    cdef cnp.ndarray[int] x = np.asfortranarray(data[x_name], dtype=np.int32)
    cdef cnp.ndarray[int] y = np.asfortranarray(data[y_name], dtype=np.int32)
    
    # Per-category running sum of y and row count.
    cdef unordered_map[int, int] value_map
    cdef unordered_map[int, int] count_map

    # Pass 1 (serial): accumulate the totals for every category.
    for i from 0 <= i < nrow by 1:
      if value_map.count(x[i]):
        value_map[x[i]] = value_map[x[i]] + y[i]
        count_map[x[i]] += 1
      else:
        value_map[x[i]] = y[i]
        count_map[x[i]] = 1

    # Pass 2 (parallel, 2 workers): p.range hands out the row indices.
    with pymp.Parallel(2) as p:
      for i in p.range(0, nrow):
        result[i] = (value_map[x[i]] - y[i]) / (count_map[x[i]] - 1)
    
    return result



In [None]:
# Benchmark every Cython variant on the same `data` frame.
# Each section prints the array it just computed, not a leftover `result`
# from an earlier cell.
start_time = time.time()
result_ori = target_mean_v_teacher(data, 'y', 'x')
end_time = time.time()

print("teacher's version total time:", (end_time - start_time))
print(result_ori)

start_time = time.time()
result_new1 = target_mean_v_new1(data, 'y', 'x')
end_time = time.time()

print("new1 version's total time:", (end_time - start_time))
print(result_new1)

start_time = time.time()
result_cplus = target_mean_v_cplus(data, 'y', 'x')
end_time = time.time()

print("cplus version's total time:", (end_time - start_time))
print(result_cplus)

start_time = time.time()
result_cplus_last = target_mean_v_cplus_last(data, 'y', 'x')
end_time = time.time()

print("cplus last version's total time:", (end_time - start_time))
print(result_cplus_last)

# The pymp experiment is defined as target_mean_v_cplus_test_pymp; the name
# called here previously did not exist in the notebook.
start_time = time.time()
result_cplus_last_pymp = target_mean_v_cplus_test_pymp(data, 'y', 'x')
end_time = time.time()

print("cplus last add version's total time:", (end_time - start_time))
print(result_cplus_last_pymp)

# Euclidean distance of each variant from the teacher's reference; 0.0 means identical output.
diffv1 = np.linalg.norm(result_new1 - result_ori)
diffv2 = np.linalg.norm(result_cplus - result_ori)
diffv3 = np.linalg.norm(result_cplus_last - result_ori)
diffv4 = np.linalg.norm(result_cplus_last_pymp - result_ori)
print("v1's diff:", diffv1)
print("v2's diff:", diffv2)
print("v3's diff:", diffv3)
print("v4's diff:", diffv4)

teacher's version total time: 0.05143594741821289
[0.50867052 0.51282051 0.54761905 ... 0.51896208 0.5204918  0.50094877]
new1 version's total time: 0.03164982795715332
[0.50867052 0.51282051 0.54761905 ... 0.51896208 0.5204918  0.50094877]
cplus version's total time: 0.011758565902709961
[0.50867052 0.51282051 0.54761905 ... 0.51896208 0.5204918  0.50094877]
cplus last version's total time: 0.012859344482421875
[0.50867052 0.51282051 0.54761905 ... 0.51896208 0.5204918  0.50094877]


FileNotFoundError: ignored

# 确保10万数据量级的数据能够在1分钟内跑出来结果

In [None]:
# 100k-row benchmark frame: binary target column `y`, ten-level category column `x`.
y = np.random.randint(2, size=(100_000, 1))
x = np.random.randint(10, size=(100_000, 1))
# Place the two column vectors side by side before wrapping them in a DataFrame.
data = pd.DataFrame(np.hstack((y, x)), columns=['y', 'x'])

In [None]:
# Re-run the Cython variants on the 100k-row frame.
# Each section prints the array it just computed; printing the global
# `result` here would show the output of an earlier, smaller run.
start_time = time.time()
result_ori = target_mean_v_teacher(data, 'y', 'x')
end_time = time.time()

print("teacher's version total time:", (end_time - start_time))
print(result_ori)

start_time = time.time()
result_new1 = target_mean_v_new1(data, 'y', 'x')
end_time = time.time()

print("new1 version's total time:", (end_time - start_time))
print(result_new1)

start_time = time.time()
result_cplus = target_mean_v_cplus(data, 'y', 'x')
end_time = time.time()

print("cplus version's total time:", (end_time - start_time))
print(result_cplus)

start_time = time.time()
result_cplus_last = target_mean_v_cplus_last(data, 'y', 'x')
end_time = time.time()

print("cplus last version's total time:", (end_time - start_time))
print(result_cplus_last)

# Euclidean distance of each variant from the teacher's reference; 0.0 means identical output.
diffv1 = np.linalg.norm(result_new1 - result_ori)
diffv2 = np.linalg.norm(result_cplus - result_ori)
diffv3 = np.linalg.norm(result_cplus_last - result_ori)
print("v1's diff:", diffv1)
print("v2's diff:", diffv2)
print("v3's diff:", diffv3)

teacher's version total time: 0.054598331451416016
[0.50867052 0.51282051 0.54761905 ... 0.51896208 0.5204918  0.50094877]
new1 version's total time: 0.029440641403198242
[0.50867052 0.51282051 0.54761905 ... 0.51896208 0.5204918  0.50094877]
cplus version's total time: 0.012102842330932617
[0.50867052 0.51282051 0.54761905 ... 0.51896208 0.5204918  0.50094877]
cplus last version's total time: 0.012049436569213867
[0.50867052 0.51282051 0.54761905 ... 0.51896208 0.5204918  0.50094877]
v1's diff: 0.0
v2's diff: 0.0
v3's diff: 0.0
