In [1]:
import os
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime"
# abort that can occur when several native libraries each bundle their own
# OpenMP copy -- confirm it is still required in this environment.
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [2]:
import os
# Sets the same environment variable, to the same value, as the first cell.
# NOTE(review): redundant once the first cell has run.
os.environ['KMP_DUPLICATE_LIB_OK']='True'
import random  # standard-library RNG used by Smote when interpolating samples
from sklearn.neighbors import NearestNeighbors  # k-nearest-neighbour search used by Smote.fit
import numpy as np
import pandas as pd
 
class Smote(object):
    """Minimal SMOTE (Synthetic Minority Over-sampling TEchnique) generator.

    Each synthetic sample is a random point on the line segment between a
    minority-class sample and one of its ``k`` nearest minority-class
    neighbours.

    Parameters
    ----------
    N : int or float, default 50
        Amount of over-sampling, as a percentage of the number of input rows.
        Below 100, a random ``N``% subset of the rows is drawn and exactly one
        synthetic sample is created per retained row.  At 100 or above,
        ``int(N / 100)`` synthetic samples are created per input row.
    k : int, default 5
        Number of nearest neighbours a sample may be interpolated towards.
        Capped at ``(rows used) - 1`` during ``fit``.
    r : int or float, default 2
        Distance-deciding factor, forwarded as ``p`` to ``NearestNeighbors``.

    Attributes
    ----------
    samples : numpy.ndarray
        Rows actually used by the last ``fit`` call.
    T, numattrs : int
        Number of rows used and number of features per row.
    synthetic : numpy.ndarray
        Synthetic samples produced by the last ``fit`` call.
    newindex : int
        Number of synthetic samples written so far by the current ``fit``.

    Notes
    -----
    Sub-sampling draws from ``np.random`` while interpolation draws from the
    standard-library ``random`` module; seed both for reproducible output.
    """

    def __init__(self, N=50, k=5, r=2):
        self.N = N
        self.k = k
        # Distance-deciding factor (``p`` of the neighbour search).
        self.r = r
        # Counts the synthetic samples generated so far.
        self.newindex = 0

    def fit(self, samples):
        """Generate synthetic samples from the minority-class ``samples``.

        The configured ``N`` and ``k`` are left untouched and the input array
        is never modified, so ``fit`` may be called repeatedly.

        Parameters
        ----------
        samples : array-like of shape (n_samples, n_features)
            Minority-class samples.

        Returns
        -------
        numpy.ndarray
            Array of shape ``(T * int(N_eff / 100), n_features)`` where ``T``
            is the number of rows used and ``N_eff`` is ``max(N, 100)``.

        Raises
        ------
        ValueError
            If ``samples`` is not 2-D, ``N`` is not positive, ``k`` is smaller
            than 1, or fewer than two rows remain to interpolate between.
        """
        data = np.asarray(samples)
        if data.ndim != 2:
            raise ValueError(
                "samples must be a 2-D array of shape (n_samples, n_features)")
        if self.N <= 0:
            raise ValueError("N must be a positive percentage")

        total_rows, self.numattrs = data.shape

        # Effective over-sampling percentage for this call only.
        percent = self.N
        if percent < 100:
            # Keep a random N% of the rows.  ``permutation`` returns a
            # shuffled copy, so the caller's array keeps its original order.
            self.T = int(percent * total_rows / 100)
            data = np.random.permutation(data)[:self.T]
            # One synthetic sample per retained row.
            percent = 100
        else:
            self.T = total_rows

        if self.T < 2:
            raise ValueError(
                "at least 2 samples are required to interpolate, got %d"
                % self.T)

        # A sample cannot have more neighbours than there are other samples.
        k = min(self.k, self.T - 1)
        if k < 1:
            raise ValueError("k must be at least 1")

        self.samples = data
        # Number of synthetic samples per input row.
        multiple = int(percent / 100)
        self.synthetic = np.zeros((self.T * multiple, self.numattrs))
        # Restart the write cursor so repeated calls fill a fresh buffer.
        self.newindex = 0

        # Ask for k + 1 neighbours: each query point is itself part of the
        # fitted data and is dropped from the front of its neighbour list.
        neighbors = NearestNeighbors(n_neighbors=k + 1,
                                     algorithm='ball_tree',
                                     p=self.r).fit(self.samples)
        neighbor_index = neighbors.kneighbors(self.samples,
                                              return_distance=False)[:, 1:]

        for i, nnarray in enumerate(neighbor_index):
            self.__populate(multiple, i, nnarray)

        return self.synthetic

    def __populate(self, N, i, nnarray):
        """Write ``N`` synthetic samples derived from sample ``i``.

        Parameters
        ----------
        N : int
            Number of synthetic samples to create for this sample.
        i : int
            Row index of the base sample in ``self.samples``.
        nnarray : sequence of int
            Row indices of the base sample's nearest neighbours.
        """
        base = self.samples[i]
        for _ in range(N):
            # Pick one of the neighbours uniformly at random.
            nn = random.randint(0, len(nnarray) - 1)
            diff = self.samples[nnarray[nn]] - base
            # Random position along the segment base -> neighbour.
            gap = random.uniform(0, 1)
            self.synthetic[self.newindex] = base + gap * diff
            # One more synthetic sample has been produced.
            self.newindex += 1


In [4]:
# Read the header-less, GBK-encoded CSV into a plain NumPy array.
# `on_bad_lines="skip"` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in pandas 2.0; malformed rows are still skipped.
samples = np.array(
    pd.read_csv(
        r'D:\研二寒假\课题\课题数据集\鲍鱼数据集\鲍鱼train_分层.csv',
        on_bad_lines="skip",
        lineterminator="\n",
        encoding="gbk",
        header=None,
    )
)

In [5]:
# Smote draws from both the standard-library `random` module and `np.random`;
# seed both so that re-running this cell yields the same synthetic points.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# N is below 100, so fit() works on a random 8.4% subset of the rows and
# creates one synthetic point per retained row.
smote = Smote(N=8.4)
synthetic_points1 = smote.fit(samples)
print(synthetic_points1)


[[ 0.6581173   0.5168827   0.18       ...  0.33800616  0.55029326
  16.37653936]
 [ 0.625       0.48229406  0.15976473 ...  0.20720594  0.33205879
   9.        ]
 [ 0.31398806  0.22921293  0.07955025 ...  0.03083134  0.04417393
   5.        ]
 ...
 [ 0.52412818  0.40895381  0.15821709 ...  0.15458153  0.25164781
  16.67829096]
 [ 0.47164275  0.35828549  0.16028705 ...  0.10647847  0.14590031
   8.        ]
 [ 0.3203483   0.23901122  0.07633707 ...  0.03921685  0.04556179
   6.        ]]


In [8]:
# Persist the synthetic points as comma-separated text.
np.savetxt(
    fname=r'D:\研二寒假\课题\课题数据集\鲍鱼数据集\SMOTE\100.csv',
    X=synthetic_points1,
    delimiter=',',
)