In [1]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
from numpy.linalg import svd
from scipy import stats

In [2]:
# statistic 

def std(a):
    ar = np.array(a)
    std = np.std(ar)
    print("std is:",std)
    return std

def variance(a):
    ar = np.array(a)
    v = np.var(ar)
    print("var is:",v)
    return v

def mode(a):
    return stats.mode(a)[0][0]

def stat(data):
    n = len(data)
    q1 = math.floor(n*0.25)
    q3 = math.floor(n*0.75)
    q1_value=data[q1]
    q3_value=data[q3]
    iqr=q3_value-q1_value
    result ={
        "median":np.median(data),
        "mode":mode(data),
        "MIN":np.min(data),
        "MAX":np.max(data),
        "Q1":q1_value,
        "Q3":q3_value,
        "IQR":iqr,
    }

    return result
    
def outlier_det(data):
    sta = stat(data)
    iqr = sta["IQR"]
    min_edge = sta["Q1"]-1.5*iqr
    max_edge = sta["Q3"]+1.5*iqr
    result=[]
    print("最小边缘是:",min_edge)
    print("最大边缘是:",max_edge)
    for i in data:
        if i < min_edge or i > max_edge:
            result.append(i)
    print("outliers 有:",result)
    return result


In [3]:
# normalization: min-max,z-score,decimal scaling
def min_max(v,o_min,o_max,n_min,n_max):
    result = (((v-o_min)*(n_max-n_min))/(o_max-o_min))+n_min
    return result

def z_score(data,m=None,s=None):
    if m:
        me = m
    else:
        me = np.mean(data)
    if s:
        st = s
    else:
        st = std(data)
    result =[]
    for i in data:
        result.append(round((i-me)/st,3))

    return result

def decimal_scaling(data):
    result=[]
    maximum = np.max(data)
    bits = round(np.log10(maximum))
    for d in data:
        result.append(d/(10**bits))
    return result

data = [100,400,600,800,3000,4200]

# min_max(73000,12000,98000,0,1)
print(z_score(data))
print(decimal_scaling(data))

std is: 1527.9797846248562
[-0.927, -0.731, -0.6, -0.469, 0.971, 1.756]
[0.01, 0.04, 0.06, 0.08, 0.3, 0.42]


In [4]:
# binning equal-depth,equal-width
def equal_depth(data,num):
    n = len(data)
    each = n//num
    result=[]
    i=0
    while i<=n-4:
        result.append(data[i:i+each])
        i+=each
    print(result)
    return result

def equal_width(data,num):
    width = (np.max(data)-np.min(data))//num
    print("width:",width)
    edge=np.min(data)+width
    result=[[] for _ in range(num)]
    for i in range(num):
        for j in data:
            if edge>=j:
                result[i].append(j)
            else:
                edge = edge+width
                break
            
    return result

data = [8, 10, 15, 35, 50, 52, 85, 89, 92, 158, 201, 251]    
equal_width(data,3)

width: 81


[[8, 10, 15, 35, 50, 52, 85, 89],
 [8, 10, 15, 35, 50, 52, 85, 89, 92, 158],
 [8, 10, 15, 35, 50, 52, 85, 89, 92, 158, 201, 251]]

In [5]:
# distance & similarity
def Euclidean(x,y):
    x=np.array(x)
    y=np.array(y)
    distance = np.linalg.norm(np.subtract(x,y))
    return distance

def Manhattan(x,y):
    x=np.array(x)
    y=np.array(y)
    distance = np.linalg.norm(np.subtract(x,y),ord=1)
    return distance
    
def Supremum(x,y):
    x=np.array(x)
    y=np.array(y)
    distance = np.linalg.norm(np.subtract(x,y),ord=np.inf)
    return distance

def Minkowski(x,y,o):
    x=np.array(x)
    y=np.array(y)
    distance = np.linalg.norm(np.subtract(x,y),ord=o)
    return distance


def sim_mat(data):
    sm = [[0 for _ in range(len(data))] for _ in range(len(data))]
    dm = dis_mat(data,data)
    for i in range(len(data)):
        for j in range(i):
            sm[i][j]=1-dm[i][j]
    return sm
            

def dis_mat(data1,data2, dis_type=2,s=False):
    dm = [[0 for _ in range(len(data2))] for _ in range(len(data1))]
    for i in range(len(data1)):
        for j in range(i):
            if dis_type==1:
                dm[i][j]=round(Manhattan(data1[i],data2[j]),3)
            elif dis_type==2:
                dm[i][j]=round(Euclidean(data1[i],data2[j]),3)
            elif dis_type==3:
                dm[i][j]=round(Supremum(data1[i],data2[j]),3)
                
            if s:
                dm[j][i]=dm[i][j]
    return dm

def print_dm(dm):
    for i in dm:
        print(i)
    return True

def SMC(x,y):
    l = len(x)
    f_11=f_01=f_10=f_00=0
    for i in range(l):
        if x[i]==y[i] and x[i]==1:
            f_11+=1
        elif x[i]==y[i] and x[i]==0:
            f_00+=1
        else:
            if x[i]==0:
                f_01+=1
            else:
                f_10+=1
    similarity = (f_11+f_00)/(f_11+f_01+f_10+f_00)
    return similarity

def Jaccard(x,y):
    l = len(x)
    f_11=f_01=f_10=f_00=0
    for i in range(l):
        if x[i]==y[i] and x[i]==1:
            f_11+=1
        elif x[i]==y[i] and x[i]==0:
            f_00+=1
        else:
            if x[i]==0:
                f_01+=1
            else:
                f_10+=1
    similarity = (f_11)/(f_11+f_01+f_10)
    return similarity
    
    
def L2(v):
    return round(np.linalg.norm(v),3)

def cos_sim(v1,v2,d=3):
    top = dot_product(v1,v2)
    print("\n分子:",top)
    bottom = round(L2(v1)*L2(v2),d)
    print("分母:",bottom)
    result = round(top/bottom,d)
    print("cosine similarity:",result)
    return result

def dot_product(a,b):
    a=np.array(a)
    b=np.array(b)
    result = round(np.dot(a,b),3)
    return result

def EJ(x,y):
    top = dot_product(x,y)
    print("\n分子:",top)
    bottom = round(L2(x)+L2(y)-top,3)
    print("分母:",bottom)
    result = round(top/bottom,3)
    print("Extend Jaccard:",result)
    return result
    
def pearson(x,y):
    return np.corrcoef(x,y)

def correlation(x,y):
    n = len(x)
    r=0
    s_x=0
    s_y=0
    for i in range(n):
        r+=(x[i]-np.mean(x))*(y[i]-np.mean(y))
        s_x+=np.square(x[i]-np.mean(x))
        s_y+=np.square(y[i]-np.mean(y))
    cov = r/(n-1)
    print(cov)
    std_x = np.sqrt(s_x/(n-1))
    print(std_x)
    std_y = np.sqrt(s_y/(n-1))
    print(std_y)
    result = cov/(std_x*std_y)
    return result

def kevin_mean(x):
    r = []
    for i in x:
        if i != 0:
            r.append(i)
    result = np.mean(r)
    return result
    
def kevin_pearson(x,y):
    
    m1 = kevin_mean(x)
    m2 = kevin_mean(y)
    for i in range(len(x)):
        if x[i] !=0:
            x[i]=round(np.subtract(x[i],m1),1)

    for i in range(len(y)):
        if y[i] !=0:
            y[i]=round(np.subtract(y[i],m2),1)

    result = round(cos_sim(x,y),2)
    return result


def item_item(sims,rates):
    r = np.dot(sims,rates)
    result = r/np.sum(sims)
    return round(result,2)



# pearson(x,y)
# correlation(x,y)
# np.mean(x)

a = [4,0,0,5,1,0,0]
b = [0,0,0,2,4,5,0]
EJ(a,b)



分子: 14
分母: -0.811
Extend Jaccard: -17.263


-17.263

In [6]:
# entropy

def entropy(x):
    n = len(x)
    r=0
    s = np.sum(x)
    for i in range(n):
        p = x[i]/s
        r+=p*np.log2(p)
    result=round(-1*r,3)
    return result

# GINI index最小的切割
def gini(x):
    n = len(x)
    r = 0
    s = np.sum(x)
    for i in range(n):
        p = x[i]/s
        r+=np.square(p)
        
    
    result = round(1-r,3)
    return result


def error(x):
    s = np.sum(x)
    n = len(x)
    for i in range(n):
        x[i]/=s
    result = round(1-np.max(x),3)
    return result

# print(((4/20)*gini([1,3])+(8/20)*gini([1,0])+(8/20)*gini([1,7])))

In [7]:
# TF-IDF
def TF(v):
    tf = []
    for f in v:
        tf.append(round(f/np.max(v),3))
    print("normalized-tf:",tf)
    return tf

def IDF(N,DF):
    idf = []
    for df in DF:
        idf.append(round(np.log10(N/df),3))
    print("IDF:",idf)
    return idf

# 这里只计算了一行的tf-idf
def TF_IDF(v,df,N):
    tf = TF(v)
    idf = IDF(N,df)
    result=[]
    for i in range(len(df)):
        result.append(tf[i]*idf[i])
    print("tf-idf:",result)
    return result

def smoothing(f,dj,l):
    length=len(f)
    top = np.add(l , f)
    bottom = length*l+dj
    result = top/bottom
    return result


def str2index(data):
    n=len(data)
    m = []
    vocab={}
    for i in range(n):
        for j in data[i]:
            if j not in m:
                m.append(j)
                
    for i in range(len(m)):
        vocab[m[i]]=i
    print(vocab)
    result=[[0 for _ in range(len(m))]for _ in range(n)]
    
    for i in range(n):
        for j in data[i]:
            result[i][vocab[j]]+=1
            
    return result
            

In [8]:
# Simplified PageRank by Power Iteration Method
# iteration 是迭代次数

def PageRank_PI(A,r,iteration):
    
    for i in range(len(A)):
        A[i]=np.multiply(A[i],1/(np.sum(A[i])))
    
    M = np.array(A).T

    print(M)
    print(f"This iteration 0,new Page Rank is :\n{r}")
    r= np.array(r)
    for i in range(iteration):
        r= np.around(np.dot(M,r),4)
        print(f"This iteration{i+1},new Page Rank is :\n{r}")
        
    return r
    




In [9]:
# linear algebra
def Cov_matrix(X):
    mean = np.mean(X,axis=1)
    cov_m = np.cov(X)
    return cov_m

def EigDec(X):
    val,vec = np.linalg.eig(X)
    print("eigen value are:\n",val)
    print("max",max(val))
    print("eigen vector are:\n",vec)
    return val,vec

def PCA(X):
    m = Cov_matrix(X)
    val,vec = EigDec(m)
    return val,vec

def simple_svd(X):
    U,s,VT = svd(X)
    print("\n X = \n",X)
    print("\n U = \n",U)
    print("\n s = \n",s)
    print("\n VT = \n",VT)
    recovered_X = U.dot(np.diag(s)).dot(VT)
    return recovered_X

def kevin_rank(X):
    result=np.linalg.matrix_rank(X)
    return result


In [15]:
# 数据统计相关计算模板
a = [0, 5, 14, 16, 17, 19, 19, 19, 22, 30, 50]
print(stat(a))
print(std(a))
print(variance(a))
outlier_det(a)
mode(a)

{'median': 19.0, 'mode': 19, 'MIN': 0, 'MAX': 50, 'Q1': 14, 'Q3': 22, 'IQR': 8}
std is: 12.379001147971818
12.379001147971818
var is: 153.2396694214876
153.2396694214876
最小边缘是: 2.0
最大边缘是: 34.0
outliers 有: [0, 50]


19

In [11]:
# 相似度计算模板

In [14]:
# item-item推荐系统计算模板


x = [1,0,3,0,0,5,0,0,5,0,4,0]
y = [2,4,0,1,2,0,3,0,4,3,5,0]
z = [1,0,3,0,3,0,0,2,0,0,4,0]

sims=[kevin_pearson(x,y),kevin_pearson(x,z)]
print(sims)
rates=[2,3]
item_item(sims,rates)



分子: 4.8
分母: 11.594
cosine similarity: 0.414

分子: 4.48
分母: 7.631
cosine similarity: 0.587
[0.41, 0.59]


2.59