In [1]:
# -*- coding: UTF-8 -*-
#按比例抽样
# 保证脚本与Python3兼容
from __future__ import print_function

import os   #读取数据文件
import sys
 
  
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
import warnings

warnings.filterwarnings("ignore")


def readData(path):
    """
    Read a CSV file with pandas.

    :param path: location of the CSV file, passed straight to ``pd.read_csv``
    :return: DataFrame holding every column of the file, in file order
    """
    frame = pd.read_csv(path)
    # Re-select all columns by name; this yields a new frame with the same columns.
    column_names = list(frame.columns.values)
    selected = frame[column_names]
    return selected

def split_data(observed_data, split_ratio=(8, 2, 1)):
    """
    Split the data into consecutive train, evaluation and test parts.

    The parts are taken in order: the first rows go to train, the last rows
    go to test, and everything in between goes to evaluation.

    :param observed_data: any object that supports ``len()`` and slicing
    :param split_ratio: relative proportion among train, evaluation, test
    :return: train, evaluation, test data
    """
    # float() keeps the ratios fractional even where "/" on two ints is
    # integer division (Python 2 without ``from __future__ import division``).
    total = float(split_ratio[0] + split_ratio[1] + split_ratio[2])
    length = len(observed_data)
    train_cnt = int((split_ratio[0] / total) * length)
    test_cnt = int((split_ratio[2] / total) * length)
    # Slice the tail with an explicit start index. Using ``-test_cnt`` breaks
    # when test_cnt is 0: ``x[a:-0]`` is empty and ``x[-0:]`` is the whole input.
    test_start = length - test_cnt
    return observed_data[:train_cnt], observed_data[train_cnt:test_start], observed_data[test_start:]

def stratified_holdout(frame, label_col='label', frac=0.2, seed=None):
    """
    Split ``frame`` into a hold-out part and a remainder, label by label.

    For every distinct value in ``label_col`` (in order of first appearance),
    ``int(frac * group_size)`` rows are drawn at random into the hold-out part;
    the group's other rows go to the remainder.

    :param frame: DataFrame containing the column ``label_col``
    :param label_col: name of the column holding the labels
    :param frac: fraction of each label group to draw into the hold-out part
    :param seed: ``random_state`` handed to ``DataFrame.sample``; ``None``
                 leaves the draw unseeded
    :return: (holdout, remainder) DataFrames, both with a fresh 0..n-1 index
    """
    holdout_parts = []
    remainder_parts = []
    for tag in frame[label_col].unique().tolist():
        group = frame[frame[label_col] == tag]
        picked = group.sample(int(frac * len(group)), random_state=seed)
        # Rows of this group whose index labels were not drawn.
        leftover = group.loc[group.index.difference(picked.index)]
        holdout_parts.append(picked)
        remainder_parts.append(leftover)

    # pd.concat raises on an empty list, so return empty frames explicitly.
    if not holdout_parts:
        return pd.DataFrame(), pd.DataFrame()

    # Concatenate once at the end instead of re-copying the frames per label.
    holdout = pd.concat(holdout_parts, ignore_index=True)
    remainder = pd.concat(remainder_parts, ignore_index=True)
    return holdout, remainder


if __name__ == "__main__":
    # Display formatting
    pd.set_option('display.width', 1000)
    # train.csv is looked up in the current working directory
    # (a notebook kernel has no __file__ to resolve against).
    homePath = os.getcwd()
    # os.path.join picks the right separator on both Windows and Linux.
    dataPath = os.path.join(homePath, "train.csv")
    df = readData(dataPath)

    # Draw 20% of every label into the validation set; the fixed seed makes
    # the split identical on every run.
    val, train = stratified_holdout(df, label_col='label', frac=0.2, seed=42)

    val.to_csv('val1.csv', index=False)
    train.to_csv('train1.csv', index=False)

    print(len(df))
    print(len(val))
    print(len(train))

322757
64549
258208


In [2]:
# Number of training rows per label value, most frequent first.
label_distribution = train['label'].value_counts()
print(label_distribution)

7C26FADD409BD4B9    226325
816A9BEBED2D7C99     14250
0F2E4CC10EDBE80F      8634
56AFA2A526F96CC9      7164
C7E2941B65C6CCD6      1835
Name: label, dtype: int64


In [3]:
from collections import Counter

# Tally how many training rows carry each label value, to check how
# unevenly the labels are represented in the training split.
label_tally = Counter(train['label'])
print(label_tally)

Counter({'7C26FADD409BD4B9': 226325, '816A9BEBED2D7C99': 14250, '0F2E4CC10EDBE80F': 8634, '56AFA2A526F96CC9': 7164, 'C7E2941B65C6CCD6': 1835})
