# ai do smoking  base on naive bayes

## 基本想法
> 1. 将人类知识转化为先验 beta 分布  
> 2. 将文件的先验分布作为真实的提交标签，使用 naive bayes 去作估计决策 
> 3. 根据决策的结果的反馈和评价来更新先验 beta 分布  
>      
> 实现思路：   
> 1. agent一开始就是个白痴，什么都不知道，只能去随便猜, 然后等待人类的反馈，以及猜的结果   
> 2. 如果人类给出了反馈，人类给出的反馈比较准确，agent应该把此次输入和反馈数据复制一份到可信的提交分布列表中，作为估计的可信来源,以及作为标准的<input-output>数据列表  
> 3. 猜的结果这个反馈，由于不是直接反馈，没有人类给出的反馈的可信度高，这个就停留在一般的提交分布列表中， 作为估计的一般可信来源  
> 4. 如果人类给出了关于代码的先验知识，那么这些先验知识放在知识列表中，作为估计的直接准确来源  
> 5. 如果没有反馈，还没有先验知识，那么agent会一直都是个白痴   

### step1 导入相关库

In [1]:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import sqlite3

import sys
import os
import random
import shutil


### step2 生成测试代码目录

In [110]:
def rand_name(types="dir"):
    # 随机生成
    name_len_max = 10
    rand_source = "".join([chr(i) for i in list(range(97, 122))]) + "_"
    rand_name = "".join(random.sample(rand_source, random.randint(3,name_len_max)))
    if types == "file":
        rand_suffix = [".h", ".cpp", ".c", ".txt"]
        rand_name += random.choice(rand_suffix)
    return rand_name

def generate_rand_files():
    file_nums_max = 1
    for _ in range(random.randint(1,file_nums_max)):
        file_name = rand_name("file")
        content = """
#include <iostream>
#include <cstdlib>
#include <cstring>
#include <cstdio>
#include <ctime>
#include <algorithm>
using namespace std;

const int nmax=10000000;
int a[nmax];

void permute(int* a, int n)
{
    for (int i = n; i > 0; i--)swap(a[i-1], a[rand()%i]);
}

void printa(const int* a, int n)
{
    for (int i = 0; i < 10; i++)printf("%d\n", a[i]);
}

int main(){
    n = 10;
    for (int i = 0; i < n; i++)a[i] = i; 
    permute(a, n);
    printa(a, n);
    return 0;
}
        """
        fd = os.open(file_name, os.O_RDWR | os.O_CREAT)
        os.write(fd, str.encode(content))
        os.close(fd)
    pass
    
def generate_rand_code_dir(code_dir_name="code-smoking", depth=0):
    # 随机生成代码目录
    depth_max = 5
    generate_file_prob = 0.4
    dir_nums_max = 5
    if depth > depth_max:
        return
    origin_path = os.getcwd()
    os.mkdir(code_dir_name)
    os.chdir(code_dir_name)
    for _ in range(random.randint(2,dir_nums_max)):
        if random.random() < generate_file_prob or depth == depth_max:
            generate_rand_files()
        else:
            dir_name = rand_name(types="dir")
            generate_rand_code_dir(dir_name, depth + 1)
    os.chdir(origin_path)
    return 


In [114]:
# main-test
print(os.getcwd())
if os.path.exists("code-smoking"):
    shutil.rmtree("code-smoking")
generate_rand_code_dir(code_dir_name="code-smoking", depth=0)
print(os.getcwd())

D:\lxb\lxber-code\code\ai-do\ai-do-ci-in-smoking
D:\lxb\lxber-code\code\ai-do\ai-do-ci-in-smoking


### setp 准备用于存储的数据库

In [2]:
class SqliteDb(object):
    def __init__(self, db_name):
        self.db_name = db_name
        self.conn = None
        self.cursor = None
        pass
    
    def init_db(self):
        self.conn = sqlite3.connect(self.db_name)
        self.conn.row_factory = self.dict_factory
        self.cursor = self.conn.cursor()
        pass

    def exec_sql(self, sql=""):
        result = None
        self.cursor.execute(sql)
        if 'select' in sql.lower():
            result = self.cursor.fetchall()
        self.conn.commit()
        return result
    
    def close_db(self):
        self.cursor.close()
        self.conn.close()
    
    def dict_factory(self, cursor, row):  
        d = {}  
        for idx, col in enumerate(cursor.description):  
            d[col[0]] = row[idx]  
        return d 
    

In [3]:
# main-test
db = SqliteDb("test.db")
db.init_db()

In [4]:
sql_create_code_struct = """
create table code_struct 
(
    id integer primary key , 
    path varchar(255) unique , 
    is_file int 
);
"""
sql_create_priori_knowledge = """
create table priori_knowledge 
(
    id integer primary key , 
    path varchar(255) unique , 
    is_file int , 
    feature varchar(50) 
);
"""
sql_create_priori_distribution = """
create table priori_distribution 
(
    id integer primary key , 
    path varchar(255) unique ,  
    feature varchar(50) 
);
"""
sql_create_estimate_distribution = """
create table estimate_distribution 
(
    id integer primary key , 
    path varchar(255) unique , 
    feature varchar(50) , 
    mts_a real , 
    mts_b real , 
    mts_mean real , 
    signal_a real , 
    signal_b real , 
    signal_mean real , 
    rct_a real , 
    rct_b real , 
    rct_mean real , 
    kpi_a real , 
    kpi_b real , 
    kpi_mean real  
);
"""
sql_create_commit_history = """
create table commit_history 
(
    id integer primary key , 
    commit_id varchar(255) unique , 
    time datetime , 
    branch varchar(50) , 
    author varchar(50) , 
    comment text , 
    path varchar(255) , 
    estimate_feature varchar(50) , 
    feedback_human varchar(50) , 
    feedback_result varchar(50) 
);
"""
sql_create_human_feedback_distribution = """
create table human_feedback_distribution 
(
   id integer primary key , 
   commit_id varchar(255) unique , 
   time datetime , 
   branch varchar(50) , 
   author varchar(50) , 
   comment text , 
   path varchar(255) , 
   estimate_feature varchar(50) , 
   human_feedback_feature varchar(50) , 
   estimate_feedback_consistensy int , 
   human_feedback_time datetime 
);
"""
sql_create_freq_distance = """
create table commit_distribution 
(
    id integer primary key , 
    author varchar(50), 
    path_base varchar(255) unique , 
    path_join varchar(255) , 
    freq int 
);
"""
sql_insert_knowledge_ = """
insert into priori_knowledge 
(id, name, is_dir, is_file, layer , parent, parent_id, feature, prob_mts, prob_signal, prob_rct, prob_kpi) 
values
(null, 'dts', 1, 0, 1, 'start', 0, 'mts', 1, 0, 0, 0);
"""
sql_insert_knowledge = """
insert into priori_knowledge values 
(null, 'dts', 1, 0, 1, 'start', 0, 'mts', 1, 0, 0, 0);
"""
sql_select_all = "select * from priori_knowledge;"
sql_select_by_name = "select * from priori_knowledge where name='dts';"
sql_delete_table = "drop table priori_knowledge;"


In [12]:
# db.exec_sql(sql_create_priori_knowledge)
# db.exec_sql(sql_insert_knowledge)
# db.exec_sql(sql_select_all)
# db.exec_sql(sql_select_by_name)
# db.exec_sql(sql_delete_table)

[{'feature': 'mts',
  'id': 1,
  'is_dir': 1,
  'is_file': 0,
  'layer': 1,
  'name': 'dts',
  'parent': 'start',
  'parent_id': 0,
  'prob_kpi': 0.0,
  'prob_mts': 1.0,
  'prob_rct': 0.0,
  'prob_signal': 0.0}]

In [13]:
db.close_db()

### step 定义知识对象 和 提交对象

In [25]:
class Knowledge():
    def __init__(path=None, feature=None, is_file=None):
        self.path = path
        self.is_file = is_file
        self.feature = feature
    pass

class Commit():
    def __init__():
        self.commit_id = None
        self.time = None
        self.branch = None
        self.author = None
        self.file_paths = []
        self.comment = ""
        self.estimate_feature = ""
    pass

class Feedback():
    def __init__():
        self.commit_id = None
        self.feature = None
        self.consistensy = 0
    pass

### step 解析先验知识到SqlLite文件

In [86]:

def walk_code_dir(code_path="code-smoking"):
    # 游走代码目录
    index = 0
    for root, dirs, files in os.walk(code_path):
        print (root, dirs, files)
        for dir in dirs:
            index += 1
            dir_path = os.path.join(root, dir)
            yield index, 'dir', root, dir, dir_path
            
        for file in files:
            index += 1
            file_path = os.path.join(root, file)
            yield index, 'file', root, file, file_path
    yield None
    pass

def parse_save_code_struct(code_path="code-smoking", sqlite=None):
    # 解析并保存代码结构
    fg = walk_code_dir(code_path)
    sql_insert_code_struct = """
    insert or ignore into code_struct 
    (id , path, is_file） 
    values 
    (null, code_path, 0)
    """
    while True:
        struct = next(fg)
        if struct is None:
            break
        _, file_type, _, _, path = struct
        is_file = (file_type == 'file' ? 1 : 0)
        sql_insert_code_struct += """
        , (null, '%s', '%s')
        """ % (path, is_file)
    sql_insert_code_struct += ";"
    sqlite.exec_sql(sql_insert_code_struct)
    pass

def update_priori_knowledge_from_human(knowledges, sqlite=None):
    # 更新先验知识
    sql_insert_priori_knowledge = """
    insert or replace into priori_knowledge 
    (id , path, is_file, feature） 
    values 
    """
    for kdg in knowledges:
        path = kdg.path
        is_file = kdg.is_file
        feature = kdg.feature
        sql_insert_priori_knowledge += """
        (null, '%s', '%s', '%s') 
        ,""" % (path, is_file, feature)
    sql_insert_priori_knowledge += sql_insert_priori_knowledge[0:-1] + ";"
    sqlite.exec_sql(sql_insert_priori_knowledge)
    pass

def update_priori_distribution(knowledge=None, sqlite=None):
    # 更新先验分布
    sql_update_priori_distribution = """
    insert or replace into priori_distribution 
    (id ,path ,feature)
    values 
    """
    path = knowledge.path
    is_file = knowledge.is_file
    feature = knowledge.feature
    
    if is_file == 1:
        sql_update_priori_distribution += """
        (null, '%s', '%s');
        """ % (path, feature)
        sqlite.exec_sql(sql_update_priori_distribution)
    else:
        fg = walk_code_dir(path)
        a_feature = feature
        feature_map = {}.update({path: feature})
        while True:
            struct = next(fg)
            if struct is None:
                break
            _, file_type, root, _, path = struct
            is_file = (file_type == 'file' ? 1 : 0)
                
            sql_select_feature_from_priori_knowledge = """
            select feature from priori_knowledge where path='%s';
            """ % (path)
            
            result = sqlite.exec_sql(sql_select_feature_from_priori_knowledge)
            if result is not None and len(result) > 0:
                a_feature = result[0]['feature']
            else:
                a_feature = feature_map[root]
            
            if is_file == 1:
                sql_update_priori_distribution += """
                (null, '%s', '%s')
                ,""" % (path, a_feature)
            else:
                feature_map.update({path: a_feature})
                
    sql_update_priori_distribution += sql_update_priori_distribution[0:-1] + ";"
    sqlite.exec_sql(sql_update_priori_distribution)
    pass

def init_estimate_distribution(code_path="code-smoking", sqlite=None):
    # 从code_sturct直接初始化 estimate_distribution
    sql_insert_estimate_distribution = """
    insert or replace into estimate_distribution 
    (
    id ,path ,feature ,mts_a ,mts_b ,mts_mean ,signal_a ,signal_b , 
    signal_mean ,rct_a ,rct_b ,rct_mean ,kpi_a ,kpi_b ,kpi_mean 
    )
    values 
    """
    fg = walk_code_dir(path)
    while True:
            struct = next(fg)
            if struct is None:
                break
            _, file_type, root, _, path = struct
            is_file = (file_type == 'file' ? 1 : 0)
            
            if is_file == 1:
                sql_insert_estimate_distribution += """
                (
                null ,'%s' ,null ,%s ,%s ,%s ,%s ,%s , 
                %s ,%s ,%s ,%s ,%s ,%s ,%s 
                )
                ,""" % (path ,1 ,1 ,1 ,1 ,1 , 
                        1 ,1 ,1 ,1 ,1 ,1 ,1 
                       )
    sql_insert_estimate_distribution += sql_insert_estimate_distribution[0:-1] + ";"
    sqlite.exe_sql(sql_insert_estimate_distribution)    
    pass


def update_priori_to_estimate_distribution(code_path="code-smoking", sqlite=None):
    # 从priori_distribution更新 estimate_distribution
    sql_insert_estimate_distribution = """
    insert or replace into estimate_distribution 
    (
    path ,mts_a ,mts_b ,signal_a ,signal_b ,rct_a ,rct_b ,kpi_a ,kpi_b 
    )
    values 
    """
    sql_select_from_priori_distribution = """
    select * from priori_distribution;
    """
    priori_distribution = sqlite.exe_sql(sql_select_from_priori_distribution) 
    for prd in priori_distribution:
        path, feature = prd['path'], prd['feature']
        mts_a ,mts_b ,signal_a ,signal_b ,rct_a ,rct_b ,kpi_a ,kpi_b = 1, 1, 1, 1, 1, 1, 1,1
        if feature == "mts":
            mts_a = 11
        if feature == "signal":
            signal_a = 11
        if feature == "rct":
            rct_a = 11
        if feature == "kpi":
            kpi_a = 11
        
        sql_insert_estimate_distribution += """
        (
        %s , %s , %s , %s , %s , %s , %s , %s , %s 
        )
        ,""" % (path , mts_a , mts_b , signal_a , signal_b , rct_a , rct_b , kpi_a , kpi_b)
    sql_insert_estimate_distribution += sql_insert_estimate_distribution[0:-1] + ";"
    sqlite.exe_sql(sql_insert_estimate_distribution)    
    pass

def update_estimate_from_human_feedback(path=None, feature=None):
    # 从人类反馈中更新 estimate_distribution
    sql_insert_estimate_distribution = """
    insert or replace into estimate_distribution 
    (
    path ,mts_a ,mts_b ,signal_a ,signal_b ,rct_a ,rct_b ,kpi_a ,kpi_b 
    )
    values 
    """
    mts_a ,mts_b ,signal_a ,signal_b ,rct_a ,rct_b ,kpi_a ,kpi_b = 1, 1, 1, 1, 1, 1, 1,1
    if feature == "mts":
        mts_a = 11
    if feature == "signal":
        signal_a = 11
    if feature == "rct":
        rct_a = 11
    if feature == "kpi":
        kpi_a = 11
    
    sql_insert_estimate_distribution += """
    (%s , %s , %s , %s , %s , %s , %s , %s , %s )
    ;""" % (path , mts_a , mts_b , signal_a , signal_b , rct_a , rct_b , kpi_a , kpi_b)
    sqlite.exe_sql(sql_insert_estimate_distribution)
    pass

def save_commit(commit=None, sqlite=None):
    # 保存commit
    create table commit_history 
(
    id integer primary key , 
    commit_id varchar(255) unique , 
    time datetime , 
    branch varchar(50) , 
    author varchar(50) , 
    comment text , 
    path varchar(255) , 
    estimate_feature varchar(50) , 
    feedback_human varchar(50) , 
    feedback_result varchar(50) 
);


    pass

def save_human_feedback(commit=None, featrue="", consistensy=0):
    # 保存人类对commit的直接反馈
    
    create table human_feedback_distribution 
(
   id integer primary key , 
   commit_id varchar(255) unique , 
   time datetime , 
   branch varchar(50) , 
   author varchar(50) , 
   comment text , 
   path varchar(255) , 
   estimate_feature varchar(50) , 
   human_feedback_feature varchar(50) , 
   estimate_feedback_consistensy int , 
   human_feedback_time datetime 
);
    self.commit_id = None
        self.time = None
        self.branch = None
        self.author = None
        self.file_paths = []
        self.comment = ""
        self.estimate_feature = ""
    pass

def update_estimate_from_result(commit_id="", feedback="", sqlite=None):
    # 根据commit_id查找相应的估计feature,和相应文件
    # 更新相关文件的反馈值
    
    pass

def get_file_freq_distance_from_commit_hsitory():
    # 从提交历史中获得文件之间的提交频率距离
    pass

def decision_naive_bayes(commit=None):
    # 朴素贝叶斯决策
    # 从esitmate_distribution中得到
    self.commit_id = None
        self.time = None
        self.branch = None
        self.author = None
        self.file_paths = []
        self.comment = ""
        self.estimate_feature = ""
    pass

def update_decision_to_commit():
    # 将贝叶斯的预测结果更新到commit历史表的feature字段
    pass


In [24]:
index = 0
features = ['mts', 'signal', 'rct', 'kpi']
for root, dirs, files in os.walk("code-smoking"):
    print("===========================")
    is_dir = 0
    is_file = 0
    print (root, dirs, files)
    for dir in dirs:
        index += 1
        is_dir = 1
        feature = random.choice(features)
        dir_path = os.path.join(root, dir)
        print (str(index) + " " + str(is_dir) + " " + str(is_file) + " " + feature + " " + dir_path)
    is_dir = 0
    is_file = 0
    for file in files:
        index += 1
        is_file = 1
        feature = random.choice(features)
        file_path = os.path.join(root, file)
        print(str(index) + " " + str(is_dir) + " " + str(is_file) + " " + feature + " " + file_path)

code-smoking ['qlnao', 'vliygmq'] ['cjendxvb.h', 'vedluirh.txt', 'ypr.h']
1 1 0 kpi code-smoking\qlnao
2 1 0 mts code-smoking\vliygmq
3 0 1 kpi code-smoking\cjendxvb.h
4 0 1 mts code-smoking\vedluirh.txt
5 0 1 kpi code-smoking\ypr.h
code-smoking\qlnao ['hcdj', 'lafum', 'vqtndwx'] ['pqxjfhget.cpp']
6 1 0 kpi code-smoking\qlnao\hcdj
7 1 0 kpi code-smoking\qlnao\lafum
8 1 0 signal code-smoking\qlnao\vqtndwx
9 0 1 mts code-smoking\qlnao\pqxjfhget.cpp
code-smoking\qlnao\hcdj ['bgfmxcqo', 'ptae', 'upvcxgki', 'yblucv'] ['tihpr_d.h']
10 1 0 mts code-smoking\qlnao\hcdj\bgfmxcqo
11 1 0 signal code-smoking\qlnao\hcdj\ptae
12 1 0 rct code-smoking\qlnao\hcdj\upvcxgki
13 1 0 rct code-smoking\qlnao\hcdj\yblucv
14 0 1 kpi code-smoking\qlnao\hcdj\tihpr_d.h
code-smoking\qlnao\hcdj\bgfmxcqo ['axroqpmkj', 'bco', 'kfjhgwosy', 'wdef'] []
15 1 0 rct code-smoking\qlnao\hcdj\bgfmxcqo\axroqpmkj
16 1 0 kpi code-smoking\qlnao\hcdj\bgfmxcqo\bco
17 1 0 kpi code-smoking\qlnao\hcdj\bgfmxcqo\kfjhgwosy
18 1 0 kpi code-