In [None]:
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

import io
import glob
import math
import os
import sys
from collections import Counter
from datetime import datetime

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# from sklearn.externals.six import StringIO  
from IPython.display import Image
from numpy import genfromtxt
from six import StringIO
from sklearn import metrics, preprocessing
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import *
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz

import gensim
import jieba
import jieba.analyse
import pydotplus
import seaborn as sns

import random
import warnings
# Silence every warning category for the rest of the notebook.
warnings.filterwarnings('ignore')

# Use the SimHei font for both font families so that CJK text in labels
# can be rendered by matplotlib.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'font.serif': ['SimHei'],
})
# seaborn's style call carries its own sans-serif font list, so it is
# spelled out here as well.
sns.set_style("darkgrid", {"font.sans-serif": ['simhei', 'Arial']})

In [None]:
# Collect the per-query CSV files that are large enough to analyse.
folder = "data/standardized_data"
files = glob.glob(f"{folder}/*")

# Files of query words considered inappropriate for the analysis; they are
# skipped even when they are long enough.
excluded_query_files = {
    '28_2019-05-23_rank_美妆_value.csv',
    '1_2019-05-23_rank_刀片_value.csv',
    '10_2019-05-23_rank_指甲剪套装_value.csv',
    '34_2019-05-23_rank_工具_value.csv',
    '33_2019-05-23_rank_美妆工具_value.csv',
    '18_2019-05-23_rank_化妆品_value.csv',
    '27_2019-05-23_rank_彩妆_value.csv',
}


def count_lines(path):
    """Return the number of lines in the file at ``path``.

    The file is opened in binary mode, so counting never depends on the
    platform's default text encoding, and the ``with`` block guarantees the
    handle is closed (the previous inline ``open(f)`` was never closed).
    Lines are delimited by ``\\n``.
    """
    with open(path, 'rb') as handle:
        return sum(1 for _ in handle)


def select_query_files(paths, min_lines=800, excluded=()):
    """Return the base names of the files in ``paths`` worth analysing.

    Parameters
    ----------
    paths : iterable of str
        Candidate paths (e.g. the result of ``glob.glob``).
    min_lines : int
        A file is kept only if it has strictly more than this many lines.
    excluded : container of str
        Base names to drop regardless of their length.

    Returns
    -------
    list of str
        Base names, in sorted path order so the result does not depend on
        the directory listing order of the operating system.

    Entries that are not regular files (e.g. sub-directories) are ignored,
    and an excluded name that is not present is simply a no-op instead of
    raising ``ValueError`` as ``list.remove`` would.
    """
    selected = []
    for path in sorted(paths):
        name = os.path.basename(path)
        if name in excluded or not os.path.isfile(path):
            continue
        if count_lines(path) > min_lines:
            selected.append(name)
    return selected


file_with_over_800_lines = select_query_files(files, 800, excluded_query_files)
file_with_over_800_lines

In [None]:
# Feature columns for which a turning point is computed, one output column each.
headers = [
    '价格数值',
    '销量数值',
    '好评数值',
    '差评数值',
    '有图评论数值',
    '类目ID数值',
    '下架时间数值',
    '免运费数值',
    '新品数值',
    '公益宝贝数值',
    '淘金币数值',
    '信誉等级数值',
    '店铺服务数值',
]

# Layout of the output table: keyword column, the three model metrics,
# then one column per feature in `headers`.
cols = ['关键词', 'accuracy', 'precision', 'recall'] + headers

# Accumulator for the output rows (one list per analysed file).
results = []

def normalize_columns_except(df, column):
    """Return a copy of ``df`` with most columns flattened to their mean.

    Every column other than ``'qid'`` and ``column`` is overwritten, in the
    copy, with the mean of that column over ``df``; ``'qid'`` and ``column``
    keep their original values. ``df`` itself is not modified.

    Raises ``ValueError`` (from ``list.remove``) when ``'qid'`` or ``column``
    is not among the columns of ``df``.
    """
    to_flatten = list(df.columns)
    for kept in ('qid', column):
        to_flatten.remove(kept)

    column_means = df.mean()
    flattened = df.copy()
    for name in to_flatten:
        # Assigning a scalar broadcasts it over the whole column.
        flattened[name] = column_means[name]

    return flattened

def row_prediction(y, column, new_value):
    """Predict for the probe row ``y`` after setting one feature to ``new_value``.

    The position of ``column`` is looked up in the columns of the
    module-level ``X``; ``y[0]`` is then modified in place at that position
    and the whole of ``y`` is passed to the module-level ``model.predict``,
    whose result is returned.
    """
    position = X.columns.get_loc(column)
    y[0][position] = new_value  # in-place: the caller's row is changed
    return model.predict(y)

def binary_search_turning_point(lo, hi, column, tol=0.000001, predict=None):
    """Bisect ``[lo, hi]`` for the value of ``column`` where the prediction changes.

    The search keeps ``lo`` on the side whose prediction equals the
    prediction at the initial ``lo`` and halves the interval until it is no
    wider than ``tol``. Bisection only locates a single change; if the
    prediction flips more than once on the interval, which change is found
    depends on the midpoints visited.

    Parameters
    ----------
    lo, hi : float
        Bounds of the search interval.
    column : str
        Name of the feature being varied.
    tol : float, optional
        Interval width at which the search stops.
    predict : callable, optional
        ``predict(value)`` returns the prediction for ``column == value``.
        By default it calls ``row_prediction(y, column, value)`` using the
        module-level probe row ``y``.

    Returns
    -------
    float
        The final lower bound, i.e. a value within ``tol`` below the point
        where the prediction changes. ``nan`` is returned when no change was
        observed at any probed point (including ``hi``), so that "no turning
        point" is not reported as a value next to ``hi``. If the interval is
        already no wider than ``tol``, ``lo`` is returned without probing.
    """
    if predict is None:
        def predict(value):
            return row_prediction(y, column, value)

    if hi - lo <= tol:
        return lo

    # The prediction at the lower bound never changes during the search
    # (lo only moves to points with the same prediction), so probe it once.
    lo_pred = predict(lo)
    hi_pred = predict(hi)

    found_change = False
    while hi - lo > tol:
        mi = (lo + hi) / 2
        if predict(mi) == lo_pred:
            lo = mi
        else:
            hi = mi
            found_change = True

    if not found_change and hi_pred == lo_pred:
        # Same prediction everywhere we looked: there is nothing to report.
        return float('nan')
    return lo

# For every selected file: fit a decision tree on all columns but the last
# (the last column is the target), score it on a 25% hold-out, then search a
# turning point for each feature listed in `headers`.
for filename in file_with_over_800_lines:
    file = f"{folder}/{filename}"

    df = pd.read_csv(file)
    X, y = df.iloc[:, :-1], df.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=0
    )

    # `fit` returns the estimator itself, so construction and fitting chain.
    model = DecisionTreeClassifier(criterion='gini', random_state=100).fit(X_train, y_train)
    y_pred = model.predict(X_test)

    row = [
        filename,
        metrics.accuracy_score(y_test, y_pred),
        metrics.precision_score(y_test, y_pred),
        metrics.recall_score(y_test, y_pred),
    ]

    for col in headers:
        # `y` is deliberately rebound here: binary_search_turning_point reads
        # the module-level `y` as its probe row, and row_prediction mutates
        # it in place, so a fresh row of feature means is built per feature.
        y = X.mean().values.reshape((1, -1))
        row.append(binary_search_turning_point(0, 1, col))

    results.append(row)

result = pd.DataFrame(results, columns=cols)
result.to_csv('single_query_turning_point_analysis.csv', index=False, encoding='utf-8-sig')
result

# Relationship between query and feature

In [None]:
folder = "data/standardized_data"
queries = []
# Column labels applied to the correlation rows below; this assumes every
# file has exactly these columns in this order — TODO confirm against the data.
columns = ['qid', '商品名称数值'] + headers + ['排行数值']
for filename in file_with_over_800_lines:
    file = f"{folder}/{filename}"

    df = pd.read_csv(file)
    corr = df.corr()
    # Last row of the correlation matrix: correlation of the file's last
    # column with every column.
    queries.append(corr.iloc[-1, :].values)

# Same correlation row for the aggregated data set.
file = "data/aggregated_data/combined_data.csv"

df = pd.read_csv(file)
queries.append(df.corr().iloc[-1, :].values)

index = file_with_over_800_lines + ['combined_data.csv']
df = pd.DataFrame(queries, index = index, columns = columns)
# Drop the first column ('qid') and the last one (the column every row is
# correlated against, whose self-correlation carries no information).
df = df.iloc[:, 1:-1]
colormap = sns.diverging_palette(220, 10, as_cmap=True)
fig, ax = plt.subplots(figsize=(10, 10))
# Generate the heat map with annotated values. The tick labels come from the
# frame's own index/columns: seaborn centres them on the cells, whereas
# overriding them with plt.xticks/plt.yticks at integer positions would put
# each label on a cell edge.
sns.heatmap(
    df,
    cmap=colormap,
    annot=True,
    fmt=".2f",
    xticklabels=True,
    yticklabels=True,
    ax=ax,
)
ax.set_title('Correlation of each feature with the last column, per file')
# show plot
plt.show()