In [1]:
import tensorflow as tf
import random
import os
import sys
import numpy as np
import torch
import logging
import argparse
import shutil
import cv2
import time
import threading
import multiprocessing
from pathlib import Path
from tqdm import tqdm
from matplotlib import pyplot as plt
from icecream import ic
from collections import defaultdict
import re

2022-02-16 01:29:55.319824: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [8]:
sentense = [
    'i love my dog',
    'i love my cat',
    'i hate my dog',
    'do you think my dog is amazing'
]
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentense)
tokenizer.word_index

{'my': 1,
 'i': 2,
 'dog': 3,
 'love': 4,
 'cat': 5,
 'hate': 6,
 'do': 7,
 'you': 8,
 'think': 9,
 'is': 10,
 'amazing': 11}

In [12]:
sentense_seq = tokenizer.texts_to_sequences(sentense)

In [15]:
paded_sequence = pad_sequences(sentense_seq, padding='post',maxlen=10)
paded_sequence

array([[ 2,  4,  1,  3,  0,  0,  0,  0,  0,  0],
       [ 2,  4,  1,  5,  0,  0,  0,  0,  0,  0],
       [ 2,  6,  1,  3,  0,  0,  0,  0,  0,  0],
       [ 7,  8,  9,  1,  3, 10, 11,  0,  0,  0]], dtype=int32)

In [16]:
paded_sequence = pad_sequences(sentense_seq, padding='post',maxlen=6)
paded_sequence

array([[ 2,  4,  1,  3,  0,  0],
       [ 2,  4,  1,  5,  0,  0],
       [ 2,  6,  1,  3,  0,  0],
       [ 8,  9,  1,  3, 10, 11]], dtype=int32)

In [17]:
paded_sequence = pad_sequences(sentense_seq, padding='post',maxlen=6, truncating='post')
paded_sequence

array([[ 2,  4,  1,  3,  0,  0],
       [ 2,  4,  1,  5,  0,  0],
       [ 2,  6,  1,  3,  0,  0],
       [ 7,  8,  9,  1,  3, 10]], dtype=int32)

In [18]:
paded_sequence = pad_sequences(sentense_seq, padding='post',maxlen=6, truncating='post', value=-1)
paded_sequence

array([[ 2,  4,  1,  3, -1, -1],
       [ 2,  4,  1,  5, -1, -1],
       [ 2,  6,  1,  3, -1, -1],
       [ 7,  8,  9,  1,  3, 10]], dtype=int32)

### stopwords

In [19]:
def get_stopwords_list(stop_path):
    with open(stop_path, encoding='utf-8') as f:
        lines = f.readlines()
    return [line.strip() for line in lines]

In [30]:
stopwords_list = get_stopwords_list("english")

In [21]:
get_stopwords_list("english")

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [22]:
def move_stopwords(sentence_list, stopwords_list):
    out_list = []
#     stopwords_list.append('\t')
    for word in sentence_list:
        if word in stopwords_list:continue
        out_list.append(word)
    return out_list

In [28]:
tokenizer.word_index.keys()
# tokenizer.word_docs

dict_keys(['my', 'i', 'dog', 'love', 'cat', 'hate', 'do', 'you', 'think', 'is', 'amazing'])

In [32]:
move_stopwords(tokenizer.word_index.keys(), stopwords_list)

['dog', 'love', 'cat', 'hate', 'think', 'amazing']

#### nltk

In [35]:
# !pip install nltk -i https://pypi.tuna.tsinghua.edu.cn/simple

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting nltk
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/43/0b/8298798bc5a9a007b7cae3f846a3d9a325953e0f9c238affa478b4d59324/nltk-3.7-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
Collecting regex>=2021.8.3
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/82/b9/09143a2072af5571227f1687e44fd9041cc5933fffaf2fbc30394c720141/regex-2022.1.18-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (748 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m749.0/749.0 KB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: regex, nltk
Successfully installed nltk-3.7 regex-2022.1.18
[0m

In [36]:
from nltk.corpus import stopwords

In [34]:
# !which python

/opt/conda/bin/python


In [38]:
stopwords.words("english")

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [39]:
import string

In [42]:
punctuation = string.punctuation
punctuation.split()

['!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~']

In [45]:
stop = set(stopwords_list)
stop.update(set(punctuation))
list(stop)

['hasn',
 'or',
 'where',
 ')',
 "you'll",
 'whom',
 'that',
 'shan',
 '#',
 "that'll",
 ',',
 'y',
 'it',
 'once',
 'were',
 'haven',
 ';',
 'how',
 '\\',
 'against',
 'there',
 'because',
 'don',
 'up',
 "don't",
 'ours',
 'i',
 'here',
 "weren't",
 'his',
 'does',
 '%',
 'through',
 ']',
 "shan't",
 "didn't",
 'they',
 'ain',
 'll',
 'other',
 'to',
 'very',
 'before',
 'being',
 'won',
 'he',
 'about',
 '`',
 'further',
 '|',
 'did',
 'those',
 'who',
 'an',
 'when',
 'each',
 "'",
 'no',
 'doesn',
 "haven't",
 'which',
 'then',
 'are',
 'hadn',
 'on',
 'shouldn',
 'both',
 "wouldn't",
 '!',
 'can',
 's',
 'wasn',
 'most',
 "mustn't",
 'as',
 'do',
 'mustn',
 'itself',
 'weren',
 'wouldn',
 '}',
 'themselves',
 '$',
 "it's",
 've',
 'out',
 'o',
 'down',
 'all',
 'of',
 'our',
 'yours',
 '/',
 'own',
 'me',
 'under',
 'ma',
 'what',
 'them',
 'had',
 'now',
 'again',
 '@',
 'in',
 'with',
 '(',
 'below',
 'she',
 'myself',
 '=',
 "wasn't",
 'its',
 'why',
 "needn't",
 'any',
 '[',


#### jieba

In [47]:
# !pip install jieba -i https://pypi.tuna.tsinghua.edu.cn/simple

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting jieba
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/c6/cb/18eeb235f833b726522d7ebed54f2278ce28ba9438e3135ab0278d9792a2/jieba-0.42.1.tar.gz (19.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.2/19.2 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: jieba
  Building wheel for jieba (setup.py) ... [?25ldone
[?25h  Created wheel for jieba: filename=jieba-0.42.1-py3-none-any.whl size=19314477 sha256=4fb5ff6a8669450ed170677df09bd8a339ad05c2d3d5a1e12c368811e492c015
  Stored in directory: /root/.cache/pip/wheels/95/1a/6d/75355e7a5c76ed48e2d6cde3b95c4828e83274b93f5392ac96
Successfully built jieba
Installing collected packages: jieba
Successfully installed jieba-0.42.1
[0m

In [48]:
import jieba


In [50]:
sentense2 = '中国是最强大的'
s = jieba.cut(sentense2)
list(s)

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.715 seconds.
Prefix dict has been built successfully.


['中国', '是', '最', '强大', '的']

http://aimaksen.bslience.cn/bbc-text.csv

In [54]:
%ls

[0m[01;34mSaves[0m/    bbc-text.csv  test01.ipynb  test03.ipynb  test05.ipynb  test9.ipynb
[01;34mSources[0m/  english       test02.ipynb  test04.ipynb  test08.ipynb


In [52]:
# !cp /tmp/bbc-text.csv .
