In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE

In [None]:
# Random seed for the notebook.
seed = 2021

# Load the raw training and test tables.
train = pd.read_csv('../dataset/train.csv')
test = pd.read_csv('../dataset/test.csv')

# Flatten every raw training row into [id, value_0, ..., value_n, label]:
# column 0 holds the id, column 1 a comma-separated string of floats,
# column 2 the label.
train_list = [
    [row[0], *(float(value) for value in row[1].split(',')), row[2]]
    for row in train.values
]
train_feature_names = [str(k) for k in range(len(train_list[0]) - 2)]
train = pd.DataFrame(
    np.array(train_list),
    columns=['id', *train_feature_names, 'label'],
)

# Same flattening for the test rows, which carry no label column.
test_list = [
    [row[0], *(float(value) for value in row[1].split(','))]
    for row in test.values
]
test_feature_names = [str(k) for k in range(len(test_list[0]) - 1)]
test = pd.DataFrame(
    np.array(test_list),
    columns=['id', *test_feature_names],
)

# Model inputs: feature matrices without id/label, plus the training target.
x_train = train.drop(columns=['id', 'label'])
y_train = train['label']
x_test = test.drop(columns=['id'])

# data['label'] = range(80000, 80000 + data.shape[0])

# 使用数据间的余弦相似度扩展数据集

In [None]:
from scipy.spatial.distance import cosine, pdist, squareform

# Work only on the training rows labelled 1.
data_1 = train[train['label'] == 1]

# Pairwise cosine distances over the feature columns (everything between the
# first column, id, and the last column, label), in condensed form.
class_1_features = data_1.iloc[:, 1:-1]
cosine_distances = pdist(class_1_features, metric='cosine')

# Expand to a square matrix and turn distance into similarity.
cosine_similarity = 1 - squareform(cosine_distances)

# Label both axes with the original row index so samples can be looked up.
similarity_df = pd.DataFrame(
    data=cosine_similarity,
    index=data_1.index,
    columns=data_1.index,
)

similarity_df

In [None]:
# For each sample keep the similarities ranked 2nd-4th in descending order
# (rank 1 is skipped; it is expected to be the sample itself).
tmp = similarity_df.apply(
    lambda row: row.sort_values(ascending=False).iloc[1:4],
    axis=1,
)

# The weakest of those three similarities per sample, lowest first.
weakest_of_top3 = tmp.min(axis=1)
sample = weakest_of_top3.sort_values(ascending=True)

# Keep the samples whose three nearest neighbours are all above 0.95.
sample = sample.loc[sample > 0.95]

# Number of qualifying samples and their share of class 1.
len(sample), len(sample) / len(data_1)

In [None]:
# Plot the first 10 qualifying samples, each together with its 3 most
# cosine-similar class-1 neighbours, on one set of axes per sample.
import random
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Use a separate name instead of rebinding `sample`: the thresholded Series is
# left intact, so this cell can be re-run without failing on `.index`.
preview_ids = sample.index[:10]

n_neighbours = 3
# Signal columns: everything between the first (id) and last (label) column.
feature_cols = data_1.columns[1:-1]

for i in preview_ids:
    # Remove the sample itself explicitly rather than assuming it sorts first,
    # then take the top `n_neighbours` by similarity.
    most_similar = (
        similarity_df.loc[i]
        .drop(i)
        .sort_values(ascending=False)
        .iloc[:n_neighbours]
        .index
    )
    plt.figure(figsize=(10, 10))
    plt.plot(data_1.loc[i, feature_cols], label='original')
    # Iterate over the neighbours actually found, so fewer than three
    # available neighbours does not raise an IndexError.
    for rank, j in enumerate(most_similar, start=1):
        plt.plot(data_1.loc[j, feature_cols], label='similar_{}'.format(rank))
    # A legend identifies every curve; repeatedly setting the title would only
    # keep the last text.
    plt.title('sample {} and its nearest neighbours'.format(i))
    plt.legend()
    plt.show()

In [None]:
# Augment class 1: average every sample with each of its (up to) three most
# similar class-1 neighbours whose cosine similarity exceeds 0.95, then append
# the fused rows to the training set.

# Signal columns: everything between the first (id) and last (label) column.
feature_cols = data_1.columns[1:-1]

fused_rows = []
seen_pairs = set()
for i in similarity_df.index:
    # Remove the sample itself explicitly rather than assuming it sorts first;
    # a neighbour tying with the self-similarity could otherwise push the
    # sample into its own neighbour list.
    neighbours = similarity_df[i].drop(i).sort_values(ascending=False).iloc[:3]
    for j, similarity in neighbours.items():
        # (i, j) and (j, i) average to the same row; fuse each unordered pair
        # once so the augmented set contains no exact duplicates.
        pair = frozenset((i, j))
        if similarity > 0.95 and pair not in seen_pairs:
            seen_pairs.add(pair)
            fused = (data_1.loc[i, feature_cols] + data_1.loc[j, feature_cols]) / 2
            fused_rows.append(fused.tolist())

# Build the frame with the training feature names up front; this also yields a
# well-formed empty frame when no pair passes the threshold.
fussion_df = pd.DataFrame(fused_rows, columns=feature_cols)
fussion_df['label'] = 1
# New ids continue after the training row count.
# NOTE(review): assumes existing ids are all below train.shape[0] - confirm.
fussion_df['id'] = range(train.shape[0], train.shape[0] + fussion_df.shape[0])

# ignore_index avoids duplicate index labels between the original and the
# fused rows in the combined frame.
new_train = pd.concat([train, fussion_df], axis=0, ignore_index=True)

# Number of fused rows and number of features per row.
len(fused_rows), len(feature_cols)

# 使用变化率的余弦相似度增强数据

In [None]:
# Differences between consecutive feature columns of the class-1 rows.
# diff(axis=1) leaves the first column as NaN; dropna(axis=1) removes it
# (along with any other column that contains a NaN).
data_1_diff = data_1.iloc[:, 1:-1].diff(axis=1).dropna(axis=1)

# At this point data_1_diff holds difference features only (no id or label
# column), so the whole frame is passed to pdist. Slicing off the first and
# last column here would discard two real features.
cosine_distances_diff = pdist(data_1_diff, metric='cosine')

# Expand to a square matrix and turn distance into similarity.
cosine_similarity_diff = 1 - squareform(cosine_distances_diff)

# Label both axes with the original row index so samples can be looked up.
similarity_df_diff = pd.DataFrame(
    cosine_similarity_diff,
    index=data_1_diff.index,
    columns=data_1_diff.index,
)

similarity_df_diff

In [None]:
# Re-attach the id and label columns from data_1 to the differenced frame;
# the assignment aligns on the row index the two frames share.
for column in ('id', 'label'):
    data_1_diff[column] = data_1[column]

In [None]:
# For each sample keep the diff-based similarities ranked 2nd-4th in
# descending order (rank 1 is skipped; it is expected to be the sample itself).
tmp = similarity_df_diff.apply(
    lambda row: row.sort_values(ascending=False).iloc[1:4],
    axis=1,
)

# The weakest of those three similarities per sample, lowest first.
weakest_of_top3 = tmp.min(axis=1)
sample = weakest_of_top3.sort_values(ascending=True)

# Distribution of that weakest top-3 similarity, saved to disk.
plt.hist(weakest_of_top3, bins=100)
plt.savefig('diff_hist.png')

# Keep the samples whose three nearest neighbours are all above 0.95 and
# report their count and share of class 1.
sample = sample.loc[sample > 0.95]
n_selected = len(sample)
print(n_selected, n_selected / len(data_1))

# Only the first 10 sample ids are kept for the preview plots.
sample = sample.index[:10]

In [None]:
# Plot each selected sample together with its 3 nearest neighbours under the
# diff-based cosine similarity. The raw signals from data_1 are drawn, not the
# differences.
import random
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

n_neighbours = 3
# Signal columns: everything between the first (id) and last (label) column.
feature_cols = data_1.columns[1:-1]

for i in sample:
    # Remove the sample itself explicitly rather than assuming it sorts first,
    # then take the top `n_neighbours` by similarity.
    most_similar = (
        similarity_df_diff.loc[i]
        .drop(i)
        .sort_values(ascending=False)
        .iloc[:n_neighbours]
        .index
    )
    plt.figure(figsize=(10, 10))
    plt.plot(data_1.loc[i, feature_cols], label='original')
    # Iterate over the neighbours actually found, so fewer than three
    # available neighbours does not raise an IndexError.
    for rank, j in enumerate(most_similar, start=1):
        plt.plot(data_1.loc[j, feature_cols], label='similar_{}'.format(rank))
    # A legend identifies every curve; repeatedly setting the title would only
    # keep the last text.
    plt.title('sample {} and its nearest neighbours (diff similarity)'.format(i))
    plt.legend()
    plt.show()

In [None]:
# Augment class 1 using the diff-based similarity: average the raw signals of
# every sample with each of its (up to) three most similar class-1 neighbours
# whose similarity exceeds 0.95, then append the fused rows to the training set.
# NOTE(review): this rebinds `new_train`, replacing any frame an earlier cell
# stored under that name - confirm that is intended.

# Signal columns: everything between the first (id) and last (label) column.
feature_cols = data_1.columns[1:-1]

fused_rows = []
seen_pairs = set()
for i in similarity_df_diff.index:
    # Remove the sample itself explicitly rather than assuming it sorts first;
    # a neighbour tying with the self-similarity could otherwise push the
    # sample into its own neighbour list.
    neighbours = similarity_df_diff[i].drop(i).sort_values(ascending=False).iloc[:3]
    for j, similarity in neighbours.items():
        # (i, j) and (j, i) average to the same row; fuse each unordered pair
        # once so the augmented set contains no exact duplicates.
        pair = frozenset((i, j))
        if similarity > 0.95 and pair not in seen_pairs:
            seen_pairs.add(pair)
            fused = (data_1.loc[i, feature_cols] + data_1.loc[j, feature_cols]) / 2
            fused_rows.append(fused.tolist())

# Build the frame with the training feature names up front; this also yields a
# well-formed empty frame when no pair passes the threshold.
fussion_df = pd.DataFrame(fused_rows, columns=feature_cols)
fussion_df['label'] = 1
# New ids continue after the training row count.
# NOTE(review): assumes existing ids are all below train.shape[0] - confirm.
fussion_df['id'] = range(train.shape[0], train.shape[0] + fussion_df.shape[0])

# ignore_index avoids duplicate index labels between the original and the
# fused rows in the combined frame.
new_train = pd.concat([train, fussion_df], axis=0, ignore_index=True)

# Number of fused rows and number of features per row.
len(fused_rows), len(feature_cols)

In [None]:
# Bar chart of how many training rows each label has, saved to disk.
label_counts = train['label'].value_counts()
plt.bar(label_counts.index, label_counts)
plt.savefig('train_label.png')
plt.show()

# The same counts as text.
print(label_counts)

In [None]:
# Per-label mean of every feature column, one line per label (0-3) on the
# same axes.
for label in range(4):
    x_train[y_train == label].mean(axis=0).plot(legend=str(label))
plt.legend(['0', '1', '2', '3'])
# Save the figure.
plt.savefig('mean.png')

In [None]:
# For each label (0-3), draw up to 100 randomly chosen training samples in a
# 10 x 10 grid and save the figure as '<label>.png'.
for label in range(4):
    plt.figure(figsize=(24, 8))
    # Rows belonging to the current label.
    label_samples = x_train[y_train == label]

    # Draw without replacement, capped at the class size. Seeding with the
    # notebook's `seed` makes the saved figures reproducible across runs.
    samples_to_plot = label_samples.sample(
        n=min(100, len(label_samples)),
        random_state=seed,
    )

    for position, (_, signal) in enumerate(samples_to_plot.iterrows(), start=1):
        plt.subplot(10, 10, position)
        plt.plot(signal)
        plt.xticks([])
        plt.yticks([])
    plt.savefig('{}.png'.format(label))
    plt.show()
    


In [None]:
# Per-label mean of the differences between consecutive feature columns, one
# line per label (0-3) on the same axes.
for label in range(4):
    x_train[y_train == label].diff(axis=1).mean(axis=0).plot(legend=str(label))
plt.legend(['0', '1', '2', '3'])
plt.savefig('mean_diff.png')

In [None]:

# Per-label mean of the running sum across feature columns, one line per
# label (0-3) on the same axes.
for label in range(4):
    x_train[y_train == label].cumsum(axis=1).mean(axis=0).plot(legend=str(label))
plt.legend(['0', '1', '2', '3'])

In [None]:
# Differences between consecutive feature columns; the first column has no
# predecessor, so its NaN is replaced with 0.
x_train_diff = x_train.diff(axis=1).fillna(0)

# Put the raw and differenced features side by side. The diff columns get a
# '_diff' suffix in the merged frame because both inputs share the same column
# names, and a plain concat would leave every label duplicated.
x_train__merged = pd.concat([x_train, x_train_diff.add_suffix('_diff')], axis=1)
x_train__merged.shape