In [9]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
from gensim.models import Word2Vec
from dash import Dash, dcc, html
from dash.dependencies import Input, Output
import plotly.express as px

In [10]:
# Fix the global NumPy seed so the synthetic dataset is identical on every run.
np.random.seed(45)

# Per-class layout of the synthetic dataset; all lists are aligned by position.
group_sizes = [100, 200, 100, 400, 400, 200, 100]
class_labels = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
drink_names = ['7Up', 'Sprite', 'Pepsi', 'Coke', 'Cappuccino', 'Espresso', 'Latte']
rank_values = [7, 6, 5, 4, 3, 2, 1]
# (mean, standard deviation) of the normally distributed Amount for each class.
# NOTE(review): a later cell regenerates this dataset with scale=20 for class A;
# confirm which spread is intended.
amount_params = [(100, 200), (200, 10), (200, 10), (400, 100), (800, 10), (800, 10), (900, 400)]

# Amount is drawn before Quantity so the random stream is consumed in a fixed order.
amount = np.concatenate([
    np.random.normal(loc=mean, scale=std, size=size)
    for (mean, std), size in zip(amount_params, group_sizes)
])
quantity = np.concatenate([
    np.random.randint(low=500, high=1001, size=800),  # Class A-D
    np.random.randint(low=1, high=501, size=700),     # Class E-G
])

data = pd.DataFrame({
    'Class': [label for label, size in zip(class_labels, group_sizes) for _ in range(size)],
    'Drink': [drink for drink, size in zip(drink_names, group_sizes) for _ in range(size)],
    'Rank': [rank for rank, size in zip(rank_values, group_sizes) for _ in range(size)],
    'Amount': amount,
    'Quantity': quantity,
})

In [11]:
# Standardise the numeric features; the scaled values overwrite the originals in `data`.
numeric_columns = ['Rank', 'Amount', 'Quantity']
scaler = StandardScaler()
data[numeric_columns] = scaler.fit_transform(data[numeric_columns])

In [12]:
# One-hot encode the Drink column and swap it for the resulting indicator columns.
onehot_encoder = OneHotEncoder()
# fit_transform returns a sparse matrix here; densify it so it can back a DataFrame.
drink_onehot = onehot_encoder.fit_transform(data[['Drink']]).toarray()
onehot_column_names = onehot_encoder.get_feature_names_out(['Drink'])
drink_onehot_frame = pd.DataFrame(drink_onehot, columns=onehot_column_names)
# Column-wise concat aligns on the row index of the two frames.
data_onehot = pd.concat([data.drop(columns=['Drink']), drink_onehot_frame], axis=1)

In [13]:
# Word2Vec: embed each drink name as a 10-dimensional vector.
# Every "sentence" is a one-token list holding the drink of a single row.
# NOTE(review): with one token per sentence there are no neighbouring words for
# the window to use — presumably the vectors carry little learned structure; confirm.
sentences = [[drink] for drink in data['Drink'].tolist()]
model = Word2Vec(sentences, vector_size=10, window=5, min_count=1, sg=0)

# Look up the vector of each row's drink and stack them into an (n_rows, vector_size) array.
drink_vectors = np.array([model.wv[tokens[0]] for tokens in sentences])
word2vec_column_names = [f'Word2Vec_{i}' for i in range(drink_vectors.shape[1])]
drink_vector_frame = pd.DataFrame(drink_vectors, columns=word2vec_column_names)
data_word2vec = pd.concat([data.drop(columns=['Drink']), drink_vector_frame], axis=1)

In [14]:
# t-SNE: project both feature sets to two dimensions.
# Columns that must never be fed to t-SNE: the class label, plus the embedding
# coordinates this cell writes below. Dropping the latter (when present) keeps
# the cell idempotent — re-running it no longer uses a previous embedding as
# input features.
non_feature_columns = ['Class', 'tsne-x', 'tsne-y']

tsne_onehot = TSNE(n_components=2, random_state=45).fit_transform(
    data_onehot.drop(columns=non_feature_columns, errors='ignore')
)
tsne_word2vec = TSNE(n_components=2, random_state=45).fit_transform(
    data_word2vec.drop(columns=non_feature_columns, errors='ignore')
)

# Store the 2-D coordinates next to the features so the plots can use them.
data_onehot['tsne-x'] = tsne_onehot[:, 0]
data_onehot['tsne-y'] = tsne_onehot[:, 1]
data_word2vec['tsne-x'] = tsne_word2vec[:, 0]
data_word2vec['tsne-y'] = tsne_word2vec[:, 1]

In [15]:
# Create the Dash app: one tab per encoding, each with a scatter plot of the
# t-SNE coordinates and a container that the selection callback fills in.
app = Dash(__name__)

hover_columns = ['Class', 'Rank', 'Amount', 'Quantity']

onehot_figure = px.scatter(
    data_onehot, x='tsne-x', y='tsne-y', color='Class', hover_data=list(hover_columns)
)
word2vec_figure = px.scatter(
    data_word2vec, x='tsne-x', y='tsne-y', color='Class', hover_data=list(hover_columns)
)

onehot_tab = dcc.Tab(label='One-Hot Encoding', children=[
    dcc.Graph(id='scatter-plot-onehot', figure=onehot_figure),
    html.Div(id='output-onehot'),
])
word2vec_tab = dcc.Tab(label='Word2Vec Encoding', children=[
    dcc.Graph(id='scatter-plot-word2vec', figure=word2vec_figure),
    html.Div(id='output-word2vec'),
])

app.layout = html.Div([
    dcc.Tabs([onehot_tab, word2vec_tab])
])

@app.callback(
    Output('output-onehot', 'children'),
    Input('scatter-plot-onehot', 'selectedData')
)
def display_selected_data_onehot(selectedData):
    """Show the rows of ``data_onehot`` behind the points selected in the plot.

    Args:
        selectedData: The graph's ``selectedData`` property: ``None`` before any
            selection, otherwise a dict whose ``'points'`` list has one entry per
            selected marker (each with ``'curveNumber'`` and ``'pointIndex'``).

    Returns:
        A placeholder string when nothing is selected, otherwise a ``html.Div``
        containing a heading and the selected rows rendered as a code block.
    """
    # Nothing selected yet, or the selection was cleared / contained no markers.
    if not selectedData or not selectedData.get('points'):
        return "No points selected."
    points = selectedData['points']

    # The figure is coloured by 'Class', so Plotly Express draws one trace per
    # class (in order of first appearance) and 'pointIndex' counts within that
    # trace only. Map (curveNumber, pointIndex) back to a DataFrame row position.
    class_values = data_onehot['Class'].to_numpy()
    rows_by_trace = [np.flatnonzero(class_values == label) for label in pd.unique(class_values)]
    indices = [int(rows_by_trace[point['curveNumber']][point['pointIndex']]) for point in points]

    selected_df = data_onehot.iloc[indices]
    return html.Div([
        html.H4('Selected Points'),
        # Fences must sit on their own lines; otherwise the table's header row
        # is parsed as the fence's info string and dropped.
        dcc.Markdown(f'```\n{selected_df.to_string()}\n```')
    ])

@app.callback(
    Output('output-word2vec', 'children'),
    Input('scatter-plot-word2vec', 'selectedData')
)
def display_selected_data_word2vec(selectedData):
    """Show the rows of ``data_word2vec`` behind the points selected in the plot.

    Args:
        selectedData: The graph's ``selectedData`` property: ``None`` before any
            selection, otherwise a dict whose ``'points'`` list has one entry per
            selected marker (each with ``'curveNumber'`` and ``'pointIndex'``).

    Returns:
        A placeholder string when nothing is selected, otherwise a ``html.Div``
        containing a heading and the selected rows rendered as a code block.
    """
    # Nothing selected yet, or the selection was cleared / contained no markers.
    if not selectedData or not selectedData.get('points'):
        return "No points selected."
    points = selectedData['points']

    # The figure is coloured by 'Class', so Plotly Express draws one trace per
    # class (in order of first appearance) and 'pointIndex' counts within that
    # trace only. Map (curveNumber, pointIndex) back to a DataFrame row position.
    class_values = data_word2vec['Class'].to_numpy()
    rows_by_trace = [np.flatnonzero(class_values == label) for label in pd.unique(class_values)]
    indices = [int(rows_by_trace[point['curveNumber']][point['pointIndex']]) for point in points]

    selected_df = data_word2vec.iloc[indices]
    return html.Div([
        html.H4('Selected Points'),
        # Fences must sit on their own lines; otherwise the table's header row
        # is parsed as the fence's info string and dropped.
        dcc.Markdown(f'```\n{selected_df.to_string()}\n```')
    ])

if __name__ == '__main__':
    # Newer Dash releases reject `run_server` in favour of `run`; fall back to
    # `run_server` only on older installs that do not provide `run`.
    start_server = getattr(app, 'run', None) or app.run_server
    start_server(debug=True)

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
import plotly.express as px
from dash import Dash, dcc, html
from dash.dependencies import Input, Output

# Data generation and preprocessing
# Fix the global NumPy seed so the synthetic dataset is identical on every run.
np.random.seed(45)

# Per-class layout of the synthetic dataset; all lists are aligned by position.
group_sizes = [100, 200, 100, 400, 400, 200, 100]
class_labels = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
drink_names = ['7Up', 'Sprite', 'Pepsi', 'Coke', 'Cappuccino', 'Espresso', 'Latte']
rank_values = [7, 6, 5, 4, 3, 2, 1]
# (mean, standard deviation) of the normally distributed Amount for each class.
amount_params = [(100, 20), (200, 10), (200, 10), (400, 100), (800, 10), (800, 10), (900, 400)]

# Amount is drawn before Quantity so the random stream is consumed in a fixed order.
amount = np.concatenate([
    np.random.normal(loc=mean, scale=std, size=size)
    for (mean, std), size in zip(amount_params, group_sizes)
])
quantity = np.concatenate([
    np.random.randint(low=500, high=1001, size=800),  # first 800 rows (classes A-D)
    np.random.randint(low=1, high=501, size=700),     # last 700 rows (classes E-G)
])

data = pd.DataFrame({
    'Class': [label for label, size in zip(class_labels, group_sizes) for _ in range(size)],
    'Drink': [drink for drink, size in zip(drink_names, group_sizes) for _ in range(size)],
    'Rank': [rank for rank, size in zip(rank_values, group_sizes) for _ in range(size)],
    'Amount': amount,
    'Quantity': quantity,
})
# Standardise the numeric features; the scaled values overwrite the originals in `data`.
numeric_columns = ['Rank', 'Amount', 'Quantity']
scaler = StandardScaler()
data[numeric_columns] = scaler.fit_transform(data[numeric_columns])

# Hand-crafted drink-to-drink matrix. A drink scores 0 against itself, 1 against
# drinks of the same family (sodas vs. coffees) and 2 against the other family,
# so the values behave like distances even though the names say "similarity".
similarity_matrix = {
    '7Up': {'7Up': 0, 'Sprite': 1, 'Pepsi': 1, 'Coke': 1, 'Cappuccino': 2, 'Espresso': 2, 'Latte': 2},
    'Sprite': {'7Up': 1, 'Sprite': 0, 'Pepsi': 1, 'Coke': 1, 'Cappuccino': 2, 'Espresso': 2, 'Latte': 2},
    'Pepsi': {'7Up': 1, 'Sprite': 1, 'Pepsi': 0, 'Coke': 1, 'Cappuccino': 2, 'Espresso': 2, 'Latte': 2},
    'Coke': {'7Up': 1, 'Sprite': 1, 'Pepsi': 1, 'Coke': 0, 'Cappuccino': 2, 'Espresso': 2, 'Latte': 2},
    'Cappuccino': {'7Up': 2, 'Sprite': 2, 'Pepsi': 2, 'Coke': 2, 'Cappuccino': 0, 'Espresso': 1, 'Latte': 1},
    'Espresso': {'7Up': 2, 'Sprite': 2, 'Pepsi': 2, 'Coke': 2, 'Cappuccino': 1, 'Espresso': 0, 'Latte': 1},
    'Latte': {'7Up': 2, 'Sprite': 2, 'Pepsi': 2, 'Coke': 2, 'Cappuccino': 1, 'Espresso': 1, 'Latte': 0}
}

# Convert the matrix to a DataFrame (outer keys become columns, inner keys rows).
similarity_df = pd.DataFrame(similarity_matrix)

# Fail early, with a readable message, if the data holds a drink the matrix lacks.
drinks = data['Drink'].unique()
missing_drinks = [drink for drink in drinks if drink not in similarity_df.index]
if missing_drinks:
    raise KeyError(f"Drinks missing from similarity_matrix: {missing_drinks}")

# Look up every row's drink in a single vectorised step. The feature columns are
# named after the matrix's own columns, so each label is guaranteed to match its
# values (labelling by order of appearance in the data only works when the two
# orders happen to coincide).
similarity_df_expanded = similarity_df.loc[data['Drink'].tolist()].add_prefix('Similarity_')
# Re-use the data's row index so the column-wise concat below lines rows up.
similarity_df_expanded.index = data.index

# Replace the raw Drink column with the expanded similarity features.
data_with_similarity = pd.concat([data.drop(columns=['Drink']), similarity_df_expanded], axis=1)

# t-SNE dimensionality reduction: embed every feature except the class label in 2-D.
tsne = TSNE(n_components=2, random_state=45)
similarity_features = data_with_similarity.drop(columns=['Class'])
tsne_results = tsne.fit_transform(similarity_features)

# Keep the two embedding axes as plot coordinates alongside the features.
tsne_x, tsne_y = tsne_results[:, 0], tsne_results[:, 1]
data_with_similarity['tsne-x'] = tsne_x
data_with_similarity['tsne-y'] = tsne_y

# Create the Dash app: a single tab with the t-SNE scatter plot and a container
# that the selection callback fills in.
app = Dash(__name__)

similarity_figure = px.scatter(
    data_with_similarity,
    x='tsne-x',
    y='tsne-y',
    color='Class',
    hover_data=['Class', 'Rank', 'Amount', 'Quantity'],
)

similarity_tab = dcc.Tab(label='t-SNE with Similarity Matrix', children=[
    dcc.Graph(id='scatter-plot-similarity', figure=similarity_figure),
    html.Div(id='output-similarity'),
])

app.layout = html.Div([
    dcc.Tabs([similarity_tab])
])

@app.callback(
    Output('output-similarity', 'children'),
    Input('scatter-plot-similarity', 'selectedData')
)
def display_selected_data_similarity(selectedData):
    """Show the rows of ``data_with_similarity`` behind the selected points.

    Args:
        selectedData: The graph's ``selectedData`` property: ``None`` before any
            selection, otherwise a dict whose ``'points'`` list has one entry per
            selected marker (each with ``'curveNumber'`` and ``'pointIndex'``).

    Returns:
        A placeholder string when nothing is selected, otherwise a ``html.Div``
        containing a heading and the selected rows rendered as a code block.
    """
    # Nothing selected yet, or the selection was cleared / contained no markers.
    if not selectedData or not selectedData.get('points'):
        return "No points selected."
    points = selectedData['points']

    # The figure is coloured by 'Class', so Plotly Express draws one trace per
    # class (in order of first appearance) and 'pointIndex' counts within that
    # trace only. Map (curveNumber, pointIndex) back to a DataFrame row position.
    class_values = data_with_similarity['Class'].to_numpy()
    rows_by_trace = [np.flatnonzero(class_values == label) for label in pd.unique(class_values)]
    indices = [int(rows_by_trace[point['curveNumber']][point['pointIndex']]) for point in points]

    selected_df = data_with_similarity.iloc[indices]
    return html.Div([
        html.H4('Selected Points'),
        # Fences must sit on their own lines; otherwise the table's header row
        # is parsed as the fence's info string and dropped.
        dcc.Markdown(f'```\n{selected_df.to_string()}\n```')
    ])

if __name__ == '__main__':
    # Newer Dash releases reject `run_server` in favour of `run`; fall back to
    # `run_server` only on older installs that do not provide `run`.
    start_server = getattr(app, 'run', None) or app.run_server
    start_server(debug=True)
