# What's For Today

## 1.1 Data Import

In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import jieba
import simplejson
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from string import ascii_letters
import numpy as np

from pandas_profiling import ProfileReport

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score, f1_score

from clean import split_score,split_review,run,recommendation
from nlp import nlp_score,review_clean,score_trans,pred_score

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
%%time
# Raw Dianping exports, kept in memory for the ad-hoc analysis cells below.
d_info = pd.read_csv(
    "raw_data/dianping_info_test3.csv",
    encoding="utf-8",
)
d_review = pd.read_csv(
    "raw_data/dianping_review_v3.csv",
    encoding="utf-8",
)

# `run` comes from the local `clean` module and is given bare file names
# (no "raw_data/" prefix) -- presumably it resolves the directory itself;
# TODO confirm against clean.run.
df = run("dianping_info_test3.csv", "dianping_review_v3.csv")

0.8926637082933747
CPU times: user 1min 39s, sys: 410 ms, total: 1min 39s
Wall time: 1min 40s


In [16]:
# Persist the frame produced by `run` so the slow cleaning step (~100 s wall
# time in the recorded run) does not have to be repeated.
df.to_csv(path_or_buf="raw_data/cleaned_data.csv")

## 1.2 Data Analysis

### 1.2.1 Review Score Histplot

In [None]:
# Four side-by-side histograms (10 bins, KDE overlay), one per score column.
# Each tuple is (subplot position, panel title, df column, extra histplot kwargs);
# the first panel passes no colour and therefore uses seaborn's default.
plt.figure(figsize=(20, 4))

panels = (
    (1, 'Total Score', 'total_review_score', {}),
    (2, 'Taste Score', 'taste', {'color': 'orange'}),
    (3, 'Surroundings Score', 'surroundings', {'color': 'r'}),
    (4, 'Service Score', 'service', {'color': 'g'}),
)
for position, title, column, extra in panels:
    plt.subplot(1, 4, position)
    plt.title(title)
    sns.histplot(df[column], kde=True, bins=10, **extra)

### 1.2.2 Features Correlation

In [None]:
# Drop the id / name / address / tag columns before the correlation study;
# everything that remains is fed to the scaler in the next cell.
df_analysis = df.drop(
    labels=['restaurant_id', 'restaurant', 'address', 'tag1', 'tag2'],
    axis="columns",
)

In [None]:
# Fit the scaler on the analysis frame, then wrap the scaled array back into a
# DataFrame so the original column labels are kept (the row index is not).
scaler = StandardScaler().fit(df_analysis)
df_transformed = pd.DataFrame(
    data=scaler.transform(df_analysis),
    columns=df_analysis.columns,
)

In [None]:
# Pairwise correlations of the scaled features, rounded to two decimals.
corrMatrix = df_transformed.corr().round(2)

# Boolean mask that hides the upper triangle (diagonal included) so each
# pair of features is annotated only once.
mask = np.triu(np.ones(corrMatrix.shape, dtype=bool))
cmap = sns.color_palette("mako", as_cmap=True)

plt.figure(figsize=(14, 10))
plt.title("Features Correlation", fontsize=20)
sns.heatmap(corrMatrix, mask=mask, cmap=cmap, annot=True)
plt.show()

In [None]:
# Clean the raw review table with the project helper, then add a `target`
# column by passing every value of '用户总分' through `score_trans`.
df_wc = review_clean(d_review)
df_wc['target'] = df_wc['用户总分'].map(score_trans)

In [None]:
def extract_keywords(text, mode = "TF-IDF", topK = 30, allowPOS = ["n", "v"]):
    """Return the top keywords of ``text`` as a one-column DataFrame.

    Parameters
    ----------
    text : str
        Text to extract keywords from.
    mode : str
        ``"TF-IDF"`` uses ``jieba.analyse.extract_tags``; any other value
        falls back to ``jieba.analyse.textrank``.
    topK : int
        Maximum number of keywords requested from jieba.
    allowPOS : list of str
        Part-of-speech tags forwarded unchanged to jieba. The default list
        is only read, never mutated.

    Returns
    -------
    pandas.DataFrame
        A single column named ``"关键词"`` holding the keywords in the order
        jieba returned them (no weights).
    """
    # The notebook only does `import jieba`, which never binds the bare name
    # `analyse`; import the submodule here so the function works on a fresh
    # kernel instead of raising NameError.
    from jieba import analyse

    extractor = analyse.extract_tags if mode == "TF-IDF" else analyse.textrank
    keywords = extractor(text, topK = topK, withWeight = False, allowPOS = allowPOS)
    return pd.DataFrame(list(keywords), columns = ["关键词"])

In [None]:
def extract_most_frequent(text, mode = "normal", topK = 30, stopwords_path = "raw_data/cn_stopwords.txt"):
    """Return the most frequent words of ``text`` as a DataFrame.

    Parameters
    ----------
    text : str
        Text to segment with jieba.
    mode : str
        ``"normal"`` segments with ``jieba.lcut``; any other value uses
        ``jieba.lcut_for_search``.
    topK : int
        Number of rows kept from the top of the frequency table.
    stopwords_path : str
        UTF-8 text file with one stop word per line. Defaults to the path
        the notebook has always used.

    Returns
    -------
    pandas.DataFrame
        Columns ``"高频词"`` (word) and ``"词频"`` (count), sorted by count in
        descending order; ties keep first-occurrence order. Stop words and
        tokens shorter than two characters are excluded.
    """
    from collections import Counter

    # Context manager closes the file handle; a set makes each stop-word
    # lookup O(1) instead of scanning a list for every token.
    with open(stopwords_path, encoding = "utf-8") as stopwords_file:
        stopwords = set(stopwords_file.read().splitlines())

    if mode == "normal":
        words = jieba.lcut(text)
    else:
        words = jieba.lcut_for_search(text)

    counts = Counter(
        word for word in words
        if len(word) >= 2 and word not in stopwords
    )

    # most_common() with no argument is a stable descending sort by count,
    # so ordering matches a plain sorted(..., reverse=True) over the items.
    keyword_df = pd.DataFrame(counts.most_common(), columns = ["高频词", "词频"])[0:topK]
    return keyword_df

In [None]:
# Join every review text ('评论内容') of one shop, selected by its '店铺id',
# into a single space-separated string.
shop_reviews = df_wc.loc[df_wc['店铺id'] == "l1KkGwL9nhdbxYJG", "评论内容"]
text_negative = " ".join(shop_reviews)

In [18]:
# Show the recommendation list for the '本帮江浙菜' cuisine.
# district='全部' presumably means "no district filter" -- TODO confirm in clean.recommendation.
recommendation(cuisine='本帮江浙菜', district='全部')

[{'price_per_person': 130,
  'restaurant_id': 'G5G3IjgmSzR5K3Sq',
  'restaurant': '侬家宴·本帮江浙小海鲜(西康店)',
  'address': '活\uf5d7\uf286\uf867\uf5bc\uf5bc\ue2ef',
  'total_review_score': 4.8,
  'tag1': '上海本帮菜',
  'tag2': '南京西路',
  'taste': 4.8,
  'surroundings': 4.8,
  'service': 4.8,
  'medium_review_number': 130,
  'good_review_number': 1655,
  'bad_review_number': 42,
  'total_review_number': 1827,
  'no_queues': 5.0,
  'nlp_score': 0.8685666666666666,
  'cuisine': '本帮江浙菜',
  'district': '静安区',
  'no_queues%': 0.0027367268746579,
  'rank': 1},
 {'price_per_person': 129,
  'restaurant_id': 'H1HuoCFIrRX3Cra1',
  'restaurant': '名厨本帮馆',
  'address': '汝\uf6fe\uedbb11\ueab7\ue2ef',
  'total_review_score': 4.5,
  'tag1': '上海本帮菜',
  'tag2': '西藏南路/世博会馆',
  'taste': 4.6,
  'surroundings': 4.0,
  'service': 4.1,
  'medium_review_number': 364,
  'good_review_number': 3115,
  'bad_review_number': 203,
  'total_review_number': 3682,
  'no_queues': 21.0,
  'nlp_score': 0.8377333333333333,
  'cuisine': '本