## 双曲线拟合（以xy=a为主）

本程序在输入多组散点数据后，对每组数据分别进行拟合，并画出拟合曲线和原始散点图。

拟合的数据组均为双曲线图像

**输入：**一张包含所有关键词数据点的表。表的第一列为关键词名字，第二列为 x 轴值（列名为 `x`），第三列为 y 轴值（列名为 `y`）。x、y 均为整数。

**输出：**一个项目文件夹，其下有两个子文件夹。文件夹一（`image_all`）包含两种函数分别拟合的全部图像；文件夹二（`image_choose`）包含程序在两种函数中自动二选一后选出的拟合图像。

In [2]:
#输入包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import leastsq
from scipy import integrate
from pylab import *
import itertools
import pyecharts
from pyecharts.charts import Scatter
from pyecharts import options as opts
from pyecharts.options import TextStyleOpts
import pandas as pd
import os
import re
from scipy.optimize import curve_fit
from pyecharts.commons.utils import JsCode

In [1]:
#计算曲线的整体函数（拟合、绘图与输出路径）
def hyperbola(list_x,list_y):
    """
    Fit the power law ``y = alpha * x ** beta`` to the given points.

    The fit is an ordinary least-squares straight line in log10-log10 space,
    ``log10(y) = log10(alpha) + beta * log10(x)``. This is what separates it
    from the fit with an additive gamma term, which works on the raw values.

    Points whose x or y is not strictly positive have no logarithm; they are
    excluded from the fit (but still returned unchanged) so that a single
    zero does not turn every fitted parameter into nan.

    Parameters
    ----------
    list_x, list_y : sequences of numbers of equal length.

    Returns
    -------
    tuple
        ``(x, y, x_fit, y_fit, alpha, beta)``: the inputs as arrays, the
        integer grid 1..999, the fitted curve evaluated on that grid, and the
        two fitted parameters.
    """
    xdata = np.array(list_x)
    ydata = np.array(list_y)

    # Only strictly positive pairs can be moved to log space.
    usable = (xdata > 0) & (ydata > 0)
    logx = np.log10(xdata[usable])
    logy = np.log10(ydata[usable])

    def residuals(p, x, y):
        # p[0] is the intercept log10(alpha), p[1] is the slope beta.
        return y - (p[0] + p[1] * x)

    out, _ier = leastsq(residuals, [1.0, -1.0], args=(logx, logy))
    alpha = 10.0 ** out[0]
    beta = out[1]

    # Evaluate the fitted curve on a fixed grid used for plotting.
    x_fit = np.arange(1, 1000, 1)
    y_fit = alpha * np.power(x_fit, beta)

    return np.array(list_x), np.array(list_y), x_fit, y_fit, alpha, beta

def hyperbola_gamma(list_x,list_y):
    """
    Fit ``y = alpha * x ** beta + gamma`` to the given points.

    Unlike the log-log power-law fit, this one minimises the residuals on the
    raw (untransformed) values with a non-linear least-squares solver, starting
    from ``alpha=1, beta=-0.5, gamma=0``.

    Parameters
    ----------
    list_x, list_y : sequences of numbers of equal length.

    Returns
    -------
    tuple
        ``(x, y, x_fit, y_fit, alpha, beta)``: the inputs as arrays, the
        integer grid 1..999, the fitted curve (including gamma) evaluated on
        that grid, and the fitted alpha and beta. The fitted gamma is only
        reflected in ``y_fit``; it is not returned, so the tuple has the same
        shape as the one returned by the gamma-free fit.
    """
    xdata = np.asarray(list_x, dtype=float)
    ydata = np.asarray(list_y, dtype=float)

    def model(x, p):
        alpha, beta, gamma = p
        return alpha * (x ** beta) + gamma

    def residuals(p, x, y):
        return y - model(x, p)

    # One initial value per free parameter: alpha, beta, gamma.
    pinit = [1.0, -0.5, 0.0]
    out, _ier = leastsq(residuals, pinit, args=(xdata, ydata))
    alpha, beta, gamma = out

    # Evaluate the fitted curve on a fixed grid used for plotting.
    x_fit = np.arange(1, 1000, 1)
    y_fit = alpha * (x_fit ** beta) + gamma

    return np.array(list_x), np.array(list_y), x_fit, y_fit, alpha, beta

def _save_fit_plot(x, y, x_fit, y_fit, label, color, out_dir):
    """Plot the raw points plus one fitted curve and save it as ``<label>.jpg`` in *out_dir*."""
    fig, axes = plt.subplots(figsize=(20, 10))
    try:
        axes.set_ylim((0, 8000))
        axes.set_xlim((1, 1000))
        axes.plot(x, y, 'o', label=label + ' original')
        # Only the line style goes in the format string; the colour comes from
        # the keyword so the two are not specified twice.
        axes.plot(x_fit, y_fit, '-', label=label + ' polyfit', color=color)
        axes.set_xlabel('x_name')
        axes.set_ylabel('y_name')
        axes.legend(loc=4)  # legend position
        fig.savefig(os.path.join(out_dir, label + '.jpg'))
    finally:
        # Close the figure so that looping over many keywords does not keep
        # every figure alive.
        plt.close(fig)


def test_draw_main_func(keyword_df,keyword,ax,image_all_path,image_choose_path,
                        ratio_threshold=1.07,x_max=400):
    '''
    Fit both candidate functions to one keyword's points, save the plots and
    pick the better fit.

    Both fits are always plotted into *image_all_path* (the gamma fit with a
    ``_2`` suffix). The fit with the smaller sum of squared errors inside the
    comparison window is plotted again into *image_choose_path*.

    Parameters
    ----------
    keyword_df : DataFrame with columns ``'x'`` and ``'y'`` for one keyword.
    keyword : keyword name, used in labels and file names.
    ax : matplotlib axes whose colour cycle supplies the curve colour.
    image_all_path, image_choose_path : existing output directories.
    ratio_threshold : the comparison window starts at the first grid index
        where ``y_fit[i] / y_fit[i + 1]`` is below this value for both fits.
        1.07 was chosen by eye from the plots and can be tuned.
    x_max : largest x value included in the comparison window.

    Returns
    -------
    tuple
        ``(num, alpha, beta)`` where ``num`` is 1 if the gamma-free fit was
        chosen and 2 if the gamma fit was chosen, followed by the chosen fit's
        alpha and beta.
    '''
    name = str(keyword)
    list_x = keyword_df['x'].tolist()
    list_y = keyword_df['y'].tolist()

    # First candidate: no gamma term.
    x, y, x_fit, y_fit, alpha, beta = hyperbola(list_x, list_y)
    # Second candidate: with gamma term.
    x2, y2, x2_fit, y2_fit, alpha2, beta2 = hyperbola_gamma(list_x, list_y)

    # NOTE(review): private matplotlib API. It replaces the `prop_cycler`
    # attribute, which newer matplotlib releases no longer provide — confirm
    # against the installed version.
    color = ax._get_lines.get_next_color()

    # Plot both fits into the "all" folder.
    _save_fit_plot(x, y, x_fit, y_fit, name, color, image_all_path)
    _save_fit_plot(x2, y2, x2_fit, y2_fit, name + '_2', color, image_all_path)

    # Find the first grid index where both curves have flattened out. Comparing
    # neighbours on slices avoids reading one element past the end of the grid.
    with np.errstate(divide='ignore', invalid='ignore'):
        flattened = ((y_fit[:-1] / y_fit[1:] < ratio_threshold)
                     & (y2_fit[:-1] / y2_fit[1:] < ratio_threshold))
    hits = np.flatnonzero(flattened)
    # If the curves never flatten, compare over every point up to x_max.
    start = int(hits[0]) if hits.size else 0

    # Compare each observation with the fit evaluated at that observation's
    # own x value, rather than pairing the k-th observation with grid x = k+1.
    x_obs = np.asarray(x, dtype=float)
    y_obs = np.asarray(y, dtype=float)
    window = (x_obs >= start - 1) & (x_obs <= x_max)
    sse_1 = np.nansum((np.interp(x_obs[window], x_fit, y_fit) - y_obs[window]) ** 2)
    sse_2 = np.nansum((np.interp(x_obs[window], x2_fit, y2_fit) - y_obs[window]) ** 2)

    # Plot the winner into the "choose" folder and report which one it was.
    if sse_1 < sse_2:
        _save_fit_plot(x, y, x_fit, y_fit, name, color, image_choose_path)
        return 1, alpha, beta
    _save_fit_plot(x2, y2, x2_fit, y2_fit, name + '_2', color, image_choose_path)
    return 2, alpha2, beta2

#文件夹的创建与路径创建
def check_mkdir(path):
    """
    Make sure the directory *path* exists and return *path*.

    Missing parent directories are created as well, and an already existing
    directory is not an error.

    Raises
    ------
    FileExistsError
        If *path* exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)
    return path
def set_path(project_mkdir):
    """
    Create the output folder layout and return the two image directories.

    A folder named ``fitline_project_<YYYY_MM_DD>`` (UTC date) is created
    under *project_mkdir*, with the sub-folders ``image_all`` and
    ``image_choose`` inside it.

    Returns
    -------
    tuple
        ``(image_all_path, image_choose_path)``.
    """
    # Imported here because the module never imports `time` explicitly.
    import time

    stamp = time.strftime('%Y_%m_%d', time.gmtime())
    project_path = check_mkdir(os.path.join(project_mkdir, 'fitline_project_' + stamp))
    image_all_path = check_mkdir(os.path.join(project_path, 'image_all'))
    image_choose_path = check_mkdir(os.path.join(project_path, 'image_choose'))
    return image_all_path, image_choose_path

In [30]:
#主要程序
def main_func(all_kw,kw_name):
    """
    Fit and plot every keyword in *kw_name*.

    Parameters
    ----------
    all_kw : DataFrame holding the points of all keywords.
    kw_name : iterable of the keyword values to process.

    Relies on two module-level names being defined before the call:
    ``project_mkdir`` (root output folder) and ``kw_columns_name`` (name of
    the keyword column in *all_kw*).
    """
    # The current axes are passed along only as the source of curve colours,
    # so that successive keywords do not reuse the same colour.
    ax = plt.gca()

    image_all_path, image_choose_path = set_path(project_mkdir)

    # Process each keyword's rows independently.
    for keyword in kw_name:
        keyword_df = all_kw[all_kw[kw_columns_name] == keyword]
        test_draw_main_func(keyword_df, keyword, ax, image_all_path, image_choose_path)
    

In [4]:
# Inputs
# Root folder under which the output project folder is created
project_mkdir = r'E:/file_mkdir_path'
# Table holding the data points of every keyword
all_kw = pd.read_excel(r'E:/keyword_sheet_path.xlsx')
# Name of the keyword column. The notes at the top of this file state that the
# keyword name is the first column of the table, so take it from there instead
# of referring to a name that has not been defined yet.
kw_columns_name = all_kw.columns[0]
kw_name_df = all_kw[[kw_columns_name]].copy()
# Unique keywords in order of first appearance. `tolist` exists on a Series,
# not on a DataFrame, so select the column before converting.
kw_name = kw_name_df[kw_columns_name].drop_duplicates().tolist()

In [None]:
# Run the main routine: fit and plot every keyword listed in kw_name
main_func(all_kw,kw_name)

In [2]:
from pyecharts.charts import Bar
#使用 options 配置项，在 pyecharts 中，一切皆 Options。
from pyecharts import options as opts
# 内置主题类型可查看 pyecharts.globals.ThemeType
from pyecharts.globals import ThemeType

# Data shared by the two equivalent ways of building the chart shown below.
categories = ["衬衫", "羊毛衫", "雪纺衫", "裤子", "高跟鞋", "袜子"]
shop_a_values = [5, 20, 36, 10, 75, 90]
shop_b_values = [15, 6, 45, 20, 35, 66]

# Way 1: create a bar chart instance and configure it step by step.
bar = Bar()
# add_xaxis takes the x-axis (category) data as a Python list.
bar.add_xaxis(categories)
# add_yaxis takes the series name first and the series data second
# (pass an empty string as the name when no legend entry is wanted).
bar.add_yaxis("商家A", shop_a_values)
bar.add_yaxis("商家B", shop_b_values)
# Most options other than the series themselves go through set_global_opts.
bar.set_global_opts(
    title_opts=opts.TitleOpts(title="主标题", subtitle="副标题")
)

# Way 2: the same chart built with chained calls, this time with a theme.
# This replaces the instance built above.
themed_init = opts.InitOpts(theme=ThemeType.LIGHT)
bar = (
    Bar(init_opts=themed_init)
    .add_xaxis(categories)
    .add_yaxis("商家A", shop_a_values)
    .add_yaxis("商家B", shop_b_values)
    .set_global_opts(
        title_opts=opts.TitleOpts(title="主标题", subtitle="副标题")
    )
)

# Render the chart inline in the notebook.
bar.render_notebook()