In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import pickle as pickle
%matplotlib inline

  from ._conv import register_converters as _register_converters


In [None]:
!nvidia-smi

## data

In [5]:
path = "/home/aida-zw/Desktop/ws/Data/sample/"
path_io = "/home/aida-zw/Desktop/ws/notebook/io/"
def read_data(csv_file, path):
    df = pd.read_csv(path + csv_file)
    df = df[['fact', 'accusation']]
    return df
train_data = read_data('data_train.csv', path)
test_data = read_data('data_test.csv', path)
train_data.shape， test_data

((154592, 2), (32508, 2))

In [6]:
min_count =100
df_tmp = train_data.groupby('accusation').filter(lambda x: len(x) > min_count).drop_duplicates(subset='accusation')
accusation_set = df_tmp.accusation.tolist()
accusation_set = [x for x in accusation_set if  ',' not in x]
train_data = train_data[train_data.accusation.isin(accusation_set)]
test_data = test_data[test_data.accusation.isin(accusation_set)]

train_data.shape, test_data.shape

((117709, 2), (29569, 2))

In [7]:
label_encoder = LabelEncoder()
label_encoder.fit(train_data.accusation)

with open(path_io+'label_encoder_accu.pickle', 'wb') as handle:
    pickle.dump(label_encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)

train_data.accusation = label_encoder.transform(train_data.accusation)
test_data.accusation = label_encoder.transform(test_data.accusation)

train_data.shape, test_data.shape, label_encoder.classes_.shape

((117709, 2), (29569, 2), (122,))

## model

In [8]:
train_data.accusation = train_data.accusation.map(int)
test_data.accusation = test_data.accusation.map(int)
train_input_fn = tf.estimator.inputs.pandas_input_fn(train_data, train_data['accusation'], shuffle=True, num_epochs=None, batch_size=512)
test_input_fn = tf.estimator.inputs.pandas_input_fn(test_data, test_data['accusation'], shuffle=False, batch_size=128)

In [9]:
tf.gfile.DeleteRecursively(path_io +'model/')

In [10]:
n_classes = len(accusation_set)
feature_columns = [hub.text_embedding_column('fact','https://tfhub.dev/google/nnlm-zh-dim128-with-normalization/1', trainable=True)]
model = tf.estimator.DNNClassifier([512, 512, 128], 
                                   feature_columns=feature_columns, n_classes=n_classes, model_dir=path_io+'model/')
model.train(input_fn=train_input_fn, max_steps=3000)

INFO:tensorflow:Using /tmp/tfhub_modules to cache modules.
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/home/aida-zw/Desktop/ws/notebook/io/model/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fec0c6e9898>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Initialize variable dnn/input_from_feature_columns/input_layer/fact_hub_module_embedding/module/embeddings/part_0:0 from checkpoint b'/tmp/tfhub_modules/8a7d79f6fd9af6ea61fa3e11a09a79a613503992/variables/

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x7fec0ce5b390>

In [None]:
%time test_predictions = list(model.predict(input_fn=test_input_fn))

## Prediction

In [58]:
class Predictor(object):
    def __init__(self, model_dir, label_encoder, n_classes, df_accu_dir):
        self.model_dir = model_dir
        self.df_accu = pd.read_csv(df_accu_dir)
        self.label_encoder = label_encoder
        self.n_classes = n_classes
        self.feature_columns = [hub.text_embedding_column('fact',hub_dir , trainable=True)]
        self.model = tf.estimator.DNNClassifier([512, 512, 128], 
                        feature_columns=self.feature_columns, n_classes=self.n_classes, model_dir=self.model_dir)
    
    def predict(self, content):

        test_data = pd.DataFrame({'fact': content})
        ans_accusation, _ = self.predict_accu(test_data)
        ans_articles = self.predict_law(test_data)
        ans_imprisonment = self.predict_time(test_data)
        #print (len(ans_accusation), len(ans_articles), len(ans_imprisonment))
        ans = pd.DataFrame({'accusation':ans_accusation, 'articles': ans_articles, 'imprisonment':ans_imprisonment})
        dict_ans = ans.to_dict(orient='records')
        return dict_ans
        
    def predict_law(self, test_data):
        law = []
        for i in range(test_data.shape[0]):
            law.append([7])
        return law

        
        
    def predict_time(self, test_data):
        time = []
        for i in range(test_data.shape[0]):
            time.append(7)
        return time
    
    
    def predict_accu(self, test_data):
        test_input_fn = tf.estimator.inputs.pandas_input_fn(
            test_data, 
            None, 
            shuffle=False, 
            batch_size=128)
        result = list(self.model.predict(input_fn=test_input_fn))
        result_class = np.array([p['classes'][0] for p in result])
        prediction_arr = label_encoder.inverse_transform(result_class.astype(int))
        accusation_num = self.get_accusation_num(self.df_accu, prediction_arr)
        
        return accusation_num, prediction_arr
    
        
    
    
    def get_accusation_num(self, df_accu, arr):
        keys = df_accu.accusation.tolist()
        keys = [x.strip() for x in keys]
        values = df_accu.accusation_num.tolist()
        dict_accu = dict(zip(keys, values))

        accusation_num = list()
        for accu in arr:
            accusation_num.append([dict_accu.get(x.replace("'", "").strip()) for x in accu.split(',')])
        return accusation_num


## if __name__ == "__main__":

In [59]:


n_classes = 122
model_dir = path_io + 'model/'
df_accu_dir = path_io + "accu.csv"
label_encoder_dir = path_io + 'label_encoder_accu.pickle'
hub_dir = path_io + 'google_nnlm-zh-dim128-with-normalization_1'


with open(label_encoder_dir, 'rb') as handle:
    label_encoder = pickle.load(handle)

content = test_data['fact'].tolist()    
    
model = Predictor(model_dir, label_encoder, n_classes, df_accu_dir)
result = model.predict(content)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/home/aida-zw/Desktop/ws/notebook/io/model/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fec0ce059b0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Initialize variable dnn/input_from_feature_columns/input_layer/fact_hub_module_embedding/module/embeddings/part_0:0 from checkpoint b'/home/aida-zw/Desktop/ws/notebook/io/google_nnlm-zh-dim128-with-normalization_1/variables/variables' with embeddings
INFO:tensorf

  if diff:


In [60]:
result

[{'accusation': [71], 'articles': [7], 'imprisonment': 7},
 {'accusation': [77], 'articles': [7], 'imprisonment': 7},
 {'accusation': [128], 'articles': [7], 'imprisonment': 7},
 {'accusation': [118], 'articles': [7], 'imprisonment': 7},
 {'accusation': [101], 'articles': [7], 'imprisonment': 7},
 {'accusation': [8], 'articles': [7], 'imprisonment': 7},
 {'accusation': [87], 'articles': [7], 'imprisonment': 7},
 {'accusation': [156], 'articles': [7], 'imprisonment': 7},
 {'accusation': [182], 'articles': [7], 'imprisonment': 7},
 {'accusation': [77], 'articles': [7], 'imprisonment': 7},
 {'accusation': [81], 'articles': [7], 'imprisonment': 7},
 {'accusation': [118], 'articles': [7], 'imprisonment': 7},
 {'accusation': [201], 'articles': [7], 'imprisonment': 7},
 {'accusation': [192], 'articles': [7], 'imprisonment': 7},
 {'accusation': [151], 'articles': [7], 'imprisonment': 7},
 {'accusation': [201], 'articles': [7], 'imprisonment': 7},
 {'accusation': [39], 'articles': [7], 'impriso

In [None]:
    def predict(self, content):
        result = []
        for a in range(0, len(content)):
            result.append({
                "accusation": [1, 2, 3],
                "imprisonment": 5,
                "articles": [5, 7, 9]
            })
        return result

In [None]:
yhat = np.array([p['classes'][0] for p in test_predictions])
y = test_data.accusation
cm = pd.DataFrame({'truth': label_encoder.inverse_transform(y.values.astype(int)), 
                   'prediction': label_encoder.inverse_transform(yhat.astype(int))})
pd.pivot_table(cm, index='truth', columns='prediction', aggfunc='size').iloc[5:10,5:10]

In [None]:
result[0]

In [None]:
yhat = np.array([p['classes'][0] for p in test_predictions])
y = test_data.accusation
cm = pd.DataFrame({'truth': label_encoder.inverse_transform(y.values.astype(int)), 
                   'prediction': label_encoder.inverse_transform(yhat.astype(int))})
pd.pivot_table(cm, index='truth', columns='prediction', aggfunc='size').iloc[5:10,5:10]

In [None]:
cm.to_csv("../output/result.csv", index=False)

In [None]:
class BaselineModel(object):
    def __init__(self):
        pass
    def predict(self, df):
        theft = df.fact.str.contains('盗')
        yhat = df.accusation.copy()
        yhat[theft] = 1
        yhat[~theft] = 0
        return yhat

In [None]:
base_model = BaselineModel()
yyhat = base_model.predict(test_data)

In [None]:
# y = test_data.accusation
# cm = pd.DataFrame({'truth': label_encoder.inverse_transform(y.values.astype(int)), 
#                    'prediction': label_encoder.inverse_transform(yyhat.astype(int))})
# pd.pivot_table(cm, index='truth', columns='prediction', aggfunc='size')

In [25]:
path_io = "/home/aida-zw/Desktop/ws/notebook/io/"