<a href="https://colab.research.google.com/github/zeffon/NLPHelp/blob/master/%E2%80%9CTensorFlow_with_GPU%E2%80%9D%E7%9A%84%E5%89%AF%E6%9C%AC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Confirm TensorFlow can see the GPU

Simply select "GPU" in the Accelerator drop-down in Notebook Settings (either through the Edit menu or the command palette at cmd/ctrl-shift-P).

In [0]:
import tensorflow as tf
# Ask TensorFlow which GPU device it reports for this runtime.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  # Anything other than the first GPU device is treated as "no GPU".
  raise SystemError('GPU device not found')

SystemError: ignored

# Observe TensorFlow speedup on GPU relative to CPU

This example constructs a typical convolutional neural network layer over a
random image and manually places the resulting ops on either the CPU or the GPU
to compare execution speed.

In [0]:
import tensorflow as tf
import timeit

# Session configuration: turn on the GPU allow_growth option.
# See https://www.tensorflow.org/tutorials/using_gpu#allowing_gpu_memory_growth
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

# CPU copy of the benchmark graph: a random (100, 100, 100, 3) tensor is passed
# through a conv2d layer (32 filters, kernel size 7) and summed to a scalar.
with tf.device('/cpu:0'):
  random_image_cpu = tf.random_normal((100, 100, 100, 3))
  net_cpu = tf.layers.conv2d(random_image_cpu, 32, 7)
  net_cpu = tf.reduce_sum(net_cpu)

# Same graph again, this time placed on the first GPU.
with tf.device('/gpu:0'):
  random_image_gpu = tf.random_normal((100, 100, 100, 3))
  net_gpu = tf.layers.conv2d(random_image_gpu, 32, 7)
  net_gpu = tf.reduce_sum(net_gpu)

# One session shared by both benchmarks; it is used by cpu() and gpu() below.
sess = tf.Session(config=config)

# Test execution once to detect errors early.
try:
  sess.run(tf.global_variables_initializer())
except tf.errors.InvalidArgumentError:
  # Print a hint for the user, then re-raise the original error unchanged.
  print(
      '\n\nThis error most likely means that this notebook is not '
      'configured to use a GPU.  Change this in Notebook Settings via the '
      'command palette (cmd/ctrl-shift-P) or the Edit menu.\n\n')
  raise

def cpu():
  """Run the CPU-placed `net_cpu` op once in the shared session `sess`."""
  sess.run(net_cpu)
  
def gpu():
  """Run the GPU-placed `net_gpu` op once in the shared session `sess`."""
  sess.run(net_gpu)
  
# Runs each op ten times and reports the summed wall-clock time.
# The try/finally guarantees the session is closed even when one of the timed
# runs raises (for example because no GPU is attached), so its resources are
# not left allocated in the kernel.
try:
  print('Time (s) to convolve 32x7x7x3 filter over random 100x100x100x3 images '
        '(batch x height x width x channel). Sum of ten runs.')
  print('CPU (s):')
  cpu_time = timeit.timeit('cpu()', number=10, setup="from __main__ import cpu")
  print(cpu_time)
  print('GPU (s):')
  gpu_time = timeit.timeit('gpu()', number=10, setup="from __main__ import gpu")
  print(gpu_time)
  # The ratio is truncated to a whole number for display.
  print('GPU speedup over CPU: {}x'.format(int(cpu_time/gpu_time)))
finally:
  sess.close()

In [0]:
# Show which pip (and which Python installation) this runtime uses.
!pip --version

In [0]:
# Install tensorboardcolab, which is imported further down in this notebook.
# NOTE(review): the version is not pinned - consider pinning for reproducibility.
!pip install tensorboardcolab



In [0]:
import os
import re
import codecs

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle

import jieba
import jieba.analyse  

In [0]:
import tensorflow as tf
from tensorboardcolab import TensorBoardColab, TensorBoardColabCallback

In [0]:
from tensorflow import keras
from keras import regularizers
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, Flatten, Dropout, GlobalAveragePooling1D

Using TensorFlow backend.


In [0]:
# Input locations, all relative to the notebook's working directory.
raw_root_path = '.'
STOPWORDS_PATH = './stopwords0.txt'
raw_train_data_path = 'comments_train.csv'
raw_test_data_path = 'comments_test.csv'
train_data_path = os.path.join(raw_root_path, raw_train_data_path)
test_data_path = os.path.join(raw_root_path, raw_test_data_path)

# Seed for the row shuffle so the selected training subset (and everything
# derived from it) is identical on every Restart & Run All.
SAMPLE_SEED = 20
# Maximum number of shuffled training rows to keep.
N_TRAIN_ROWS = 6000

# Shuffle all training rows deterministically, then keep the first N_TRAIN_ROWS.
df_train_data = (
    pd.read_csv(train_data_path, encoding='utf-8')
    .sample(frac=1, random_state=SAMPLE_SEED)
    .iloc[:N_TRAIN_ROWS, :]
)
df_test_data = pd.read_csv(test_data_path, encoding='utf-8')

In [0]:
# Preview the first rows of the shuffled training subset.
df_train_data.head()

Unnamed: 0,id,comment,label
1828,1828,为什么界面总是加载不出来哇哇哇！！！,0
2027,2027,越来越有意思了。虽然反反复复的看，跟着做印象深。,2
2671,2671,全新的学习体验，非常棒,2
5067,5067,不知道实验室还是要计时的，不知道是不是计时结束会有收费？,0
2026,2026,挺好的，不过现在是上班时间，等下班了再来学习,2


In [0]:
def text_to_wordlist(row, remove_stopwords=False):
    """Segment one comment with jieba and return the tokens joined by spaces.

    Args:
        row: The raw comment text (a string).
        remove_stopwords: When True, tokens listed in the whitespace-separated
            file at STOPWORDS_PATH are dropped. Defaults to False, which
            matches the previous behaviour where every token was kept.

    Returns:
        A single string of space-separated tokens.
    """
    # Keep only characters in U+4E00-U+9FA5 plus ASCII '?' and '!'.
    cleaned = re.sub(r'[^\u4e00-\u9fa5?!]', "", row)
    # TODO: add a custom dictionary to cover words missing from jieba's
    # default dictionary, to improve segmentation accuracy.
    tokens = jieba.cut(cleaned)
    if remove_stopwords:
        # The stop-word file is only read when filtering is requested; it was
        # previously read and split on every call although never used.
        with codecs.open(STOPWORDS_PATH, "r", "utf-8") as f_stop:
            stopwords = set(f_stop.read().split())
        tokens = (word for word in tokens if word not in stopwords)
    return ' '.join(tokens)

def Textrank(content):
    """Return up to 10 TextRank keywords (without weights) for one comment.

    The text is stripped to characters in U+4E00-U+9FA5 plus '?' and '!',
    segmented with jieba, and the tokens are re-joined with '|' before being
    handed to jieba.analyse.textrank. The stop-word file at STOPWORDS_PATH is
    registered with jieba.analyse on every call.
    """
    cleaned = re.sub(r'[^\u4e00-\u9fa5?!]', "", content)
    tokens = jieba.cut(cleaned)
    jieba.analyse.set_stop_words(STOPWORDS_PATH)
    joined = '|'.join(tokens)
    return jieba.analyse.textrank(joined, topK=10, withWeight=False)
 
def TF_IDF(content):
    """Return up to 20 TF-IDF keywords (without weights) for one comment.

    The text is stripped to characters in U+4E00-U+9FA5 plus '?' and '!',
    segmented with jieba, and the tokens are re-joined with '|' before keyword
    extraction. The stop-word file at STOPWORDS_PATH is registered with
    jieba.analyse on every call.
    """
    cleaned = re.sub(r'[^\u4e00-\u9fa5?!]', "", content)
    tokens = jieba.cut(cleaned)
    jieba.analyse.set_stop_words(STOPWORDS_PATH)
    joined = '|'.join(tokens)
    # Keyword extraction; the original author notes that jieba's tfidf.py was
    # modified for this step.
    return jieba.analyse.extract_tags(
        joined, topK=20, withWeight=False, allowPOS=())
def convert_TF_IDF(content):
    """Return the TF_IDF keywords of `content` as one space-separated string."""
    return ' '.join(TF_IDF(content))

In [0]:
# Target column for training: the `label` field of each comment.
train_y_data = df_train_data['label']

In [0]:
# Replace every comment with its space-joined TF-IDF keywords.
train_data = df_train_data['comment'].map(convert_TF_IDF)
# This slice is evaluated but not displayed: a cell only shows its last expression.
train_data[:5]
type(train_data)

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 1.066 seconds.
Prefix dict has been built succesfully.


pandas.core.series.Series

In [0]:
# Earlier setting that was tried and left disabled:
# vect = TfidfVectorizer(max_features=5000)
# Fit a TF-IDF vectoriser on the keyword strings, with min_df=6 and the
# vocabulary capped at 800 features.
vect = TfidfVectorizer(min_df=6, max_features=800)
train_data_v = vect.fit_transform(train_data)
# Report how many vocabulary entries were actually learned.
print('count features: {}'.format(len(vect.vocabulary_)))
# Disabled debugging output (full vocabulary and the first dense row):
# print(' features: {}'.format(vect.vocabulary_))
# print(repr(train_data_v.toarray()[0]))

count features: 800


In [0]:
# Hold out 20% of the vectorised rows for validation; random_state=20 fixes the split.
train_x, val_x, train_y, val_y = train_test_split(train_data_v, train_y_data, test_size=0.2, random_state=20)

In [0]:
print('-----before reshape-------')
print(train_x.shape)
print(train_y.shape)
print(val_x.shape)
print(val_y.shape)

# NOTE: despite their names, n_words and max_features both hold the fitted
# vocabulary object itself, not a count.
n_words = vect.vocabulary_
max_features = n_words
batch_size = 120
timesteps = 4

# Each feature vector is cut into `timesteps` equal slices. Fail early with a
# clear message if the vocabulary size does not divide evenly, instead of
# silently truncating the slice width.
n_features = train_x.shape[1]
if n_features % timesteps != 0:
    raise ValueError(
        'feature count {} is not divisible by timesteps {}'.format(
            n_features, timesteps))
step_dim = n_features // timesteps

# Row counts come from the data rather than being hard-coded, so the cell keeps
# working when the dataset size or the split ratio changes.
train_x_r = train_x.toarray().reshape([train_x.shape[0], timesteps, step_dim])
train_y_r = keras.utils.to_categorical(train_y, num_classes=4)
val_x_r = val_x.toarray().reshape([val_x.shape[0], timesteps, step_dim])
val_y_r = keras.utils.to_categorical(val_y, num_classes=4)

print('-----after reshape-------')
print(train_x_r.shape)
print(train_y_r.shape)
print(val_x_r.shape)
print(val_y_r.shape)

-----before reshape-------
(4800, 800)
(4800,)
(1200, 800)
(1200,)
-----after reshape-------
(4800, 4, 200)
(4800, 4)
(1200, 4, 200)
(1200, 4)


In [0]:
# Width of one timestep slice, taken from the reshaped training tensor.
data_dim = train_x_r.shape[2]

# TensorBoard helper for Colab; its callback is passed to fit() below.
tbc = TensorBoardColab()

# Three stacked LSTM layers followed by a 4-way softmax classifier.
#
# The layers are deliberately NOT stateful. Every sample is an independent
# comment and fit() reshuffles them each epoch, so carrying hidden state from
# one batch into the next would mix unrelated samples and leak training state
# into evaluation. With stateless layers the batch size no longer has to be
# baked into the input shape, so `input_shape` is used.
model = Sequential()
model.add(LSTM(32, return_sequences=True,
               input_shape=(timesteps, data_dim),
               kernel_regularizer=regularizers.l2(0.001),
               recurrent_regularizer=regularizers.l2(0.01),
               bias_regularizer=regularizers.l2(0.001),
              ))
model.add(LSTM(32, return_sequences=True))
model.add(LSTM(32))
model.add(Dense(4, activation='softmax'))

model.summary()
model.compile(loss='categorical_crossentropy',
              optimizer=tf.train.AdamOptimizer(learning_rate=0.001),
              metrics=['categorical_accuracy'])

# Train for 50 epochs, reporting validation metrics after each epoch.
model.fit(train_x_r, train_y_r,
          batch_size=batch_size,
          epochs=50,
          shuffle=True,
          validation_data=(val_x_r, val_y_r),
          callbacks=[TensorBoardColabCallback(tbc)]
         )

# Final loss and categorical accuracy on the validation split.
score, acc = model.evaluate(val_x_r, val_y_r,
                            batch_size=batch_size)
print('score: {}'.format(score))
print('acc: {}'.format(acc))

Wait for 8 seconds...
