In [1]:
#A sentence and the removing character from the sentence
sentence = "****Hello World! I am Amit Chauhan****"
removing_character = "*"
#using strip function to remove star(*)
print(sentence.strip(removing_character))

str1 = "Happy"
str2 = "Home"
print(" Good ".join([str1, str2]))

Hello World! I am Amit Chauhan
Happy Good Home


In [8]:
# to use a regular expression, we need to import re
import re
sentence = "My computer gives a very good performance in a very short time."
string = "very"
str_match = re.search(string, sentence)
print(str_match) 
for word in re.finditer("very", sentence): 
  print(word.span())

<re.Match object; span=(20, 24), match='very'>
(20, 24)
(47, 51)


## 分词和词干

### 分词 Tokenization
- 句子在前缀，中缀，后缀和异常中分开
- nltk和spacy都提供了分词方法

In [1]:
import nltk
example_string = "I'm going to meet\ M.S. Dhoni."
words = nltk.word_tokenize(example_string)
words

['I', "'m", 'going', 'to', 'meet\\', 'M.S', '.', 'Dhoni', '.']

In [2]:
import spacy
#Loading spacy english library
load_en = spacy.load('en_core_web_sm')
#take an example of string
example_string = "I'm going to meet\ M.S. Dhoni."
#load string to library 
words = load_en(example_string)
#getting tokens pieces with for loop
for tokens in words:
    print(tokens.text,end='\t')
words[2:6]


I，'m，going，to，meet\，M.S.，Dhoni，.，

going to meet\ M.S.

### 抽干 Stemming
- 将单词还原为词根或词干的含义的过程
- 去除单词的前后缀得到词根的过程，用于扩展检索，粒度较粗；Porter，「推荐」Snowball，Lancaster
- Spacy不包含词干提取器，需要使用NLTK库进行词干提取过程 

In [1]:
import nltk
#import porter stemmer from nltk
from nltk.stem.porter import PorterStemmer
pot_stem = PorterStemmer()
#random words to test porter stemmer
words = ['happy', 'happier', 'happiest', 'happiness', 'breathing', 'fairly']
for word in words:
    print(word + '----->' + pot_stem.stem(word))

happy----->happi
happier----->happier
happiest----->happiest
happiness----->happi
breathing----->breath
fairly----->fairli


In [2]:
#Snowball提取器用于更改进的方法
from nltk.stem.snowball import SnowballStemmer
snow_stem = SnowballStemmer(language='english')
for word in words:
    print(word + '----->' + snow_stem.stem(word))

happy----->happi
happier----->happier
happiest----->happiest
happiness----->happi
breathing----->breath
fairly----->fair


## 拔词和停用词

### 词性归并 Lemmatization 
- 基于词典将单词的复杂形态转变成最基础的形态，用于更细粒度、更为准确的文本分析和表达
- 词法化胜于词干，它能提供丰富的信息，这就是为什么spacy具有词法化而不是抽词干的原因
- nltk也可以Lemmatization

In [59]:
"""
__description__:词形还原Lemmatization
"""
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
 
wnl = WordNetLemmatizer()
# 获取单词的词性
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

# 分别定义需要进行还原的单词与相对应的词性
example_string = "I'm happy in this happiest place with all happiness. It feels how happier we are"
#load string to library 
words = nltk.word_tokenize(example_string)
for i in range(len(words)):
    print(wnl.lemmatize(words[i]),end='\t')
    # print(words[i]+'--'+get_wordnet_pos(pos_tag([words[i]])[0][1])+'-->'
    #       +wnl.lemmatize(words[i],get_wordnet_pos(pos_tag([words[i]])[0][1])))

I	'm	happy	in	this	happiest	place	with	all	happiness	.	It	feel	how	happier	we	are	

In [3]:
import spacy
#Loading spacy english library
load_en = spacy.load('en_core_web_sm')
#take an example of string
example_string = load_en(u"I'm happy in this happiest place with all happiness. It feels how happier we are")
for lem_word in example_string:
    print(lem_word.text, '\t', lem_word.pos_, '\t', lem_word.lemma, '\t', lem_word.lemma_)

I 	 PRON 	 4690420944186131903 	 I
'm 	 AUX 	 10382539506755952630 	 be
happy 	 ADJ 	 244022080605231780 	 happy
in 	 ADP 	 3002984154512732771 	 in
this 	 DET 	 1995909169258310477 	 this
happiest 	 ADJ 	 244022080605231780 	 happy
place 	 NOUN 	 7512738811199700769 	 place
with 	 ADP 	 12510949447758279278 	 with
all 	 DET 	 13409319323822384369 	 all
happiness 	 NOUN 	 2779265004918961325 	 happiness
. 	 PUNCT 	 12646065887601541794 	 .
It 	 PRON 	 10239237003504588839 	 it
feels 	 VERB 	 5741770584995928333 	 feel
how 	 SCONJ 	 16331095434822636218 	 how
happier 	 ADJ 	 244022080605231780 	 happy
we 	 PRON 	 16064069575701507746 	 we
are 	 AUX 	 10382539506755952630 	 be


### 停止词
- Spacy中，有一些停用词的内置列表

In [1]:
import spacy
#Loading spacy english library
load_en = spacy.load('en_core_web_sm')
print(load_en.Defaults.stop_words)

{'being', 'must', 'after', 'some', 'have', 'be', 'both', 'though', 'along', 'could', 'thereby', 're', 'hereupon', 'nobody', 'within', 'forty', 'still', 'thus', 'ten', 'everyone', 'third', 'again', 'whatever', 'nor', 'here', 'herself', 'her', 'each', 'would', 'ca', 'none', 'whoever', 'keep', 'therefore', 'that', 'former', 'between', 'does', '’ve', 'moreover', "'d", 'about', 'on', "n't", 'except', 'already', 'mostly', 'out', 'either', 'are', 'this', 'whether', 'toward', 'alone', 'am', 'throughout', 'anything', 'thereafter', 'take', 'than', 'thereupon', 'say', 'these', 'it', 'via', 'nevertheless', 'they', 'eleven', 'becomes', 'someone', 'nine', 'top', 'every', 'which', 'fifty', 'doing', 'therein', 'only', 'whither', '‘re', 'your', 'indeed', 'without', 'eight', 'never', 'such', 'six', 'in', 'i', 'can', 'who', 'everything', 'by', 'enough', 'often', 'otherwise', 'whole', 'even', 'had', 'he', 'below', 'ourselves', 'else', 'full', 'n’t', 'more', 'whereby', 'how', 'make', 'then', 'we', 'hundred

## 词性和命名实体识别

### 词性（POS）
- 获取有关文本和单词的信息作为标记的过程
- 存在动词粗标签和细粒度标签

In [22]:
import spacy
#Loading spacy english library
load_en = spacy.load('en_core_web_sm')
str1 = load_en(u"This laptop belongs to Amit Chauhan")
#pos_ tag operation 
print(str1[1].pos_)
#to know fine grained information
print(str1[1].tag_)
# 带有伪造的POS计数
pos_count = str1.count_by(spacy.attrs.POS)
print(pos_count)
str1.vocab[85].text

NOUN
NN
{90: 1, 92: 1, 100: 1, 85: 1, 96: 2}


'ADP'

### 命名实体识别（NER）
- NER可以标记文本的实体类型

In [2]:
import spacy
#Loading spacy english library
load_en = spacy.load('en_core_web_sm')
#lets label the entity in the text file
file = load_en(u" My girlFriend is living in India, Studying in IIT")
doc = file
if doc.ents:
    for ner in doc.ents:
        print(ner.text + ' - '+ ner.label_ + ' - ' + str(spacy.explain(ner.label_)))
else:
    print('No Entity Found')

India - GPE - Countries, cities, states
Studying - GPE - Countries, cities, states
IIT - ORG - Companies, agencies, institutions, etc.


## 数据清理与规范化
- 深度学习中可能不需要该步骤
- 常见预处理步骤：删除标点符号、表情符号、停用词、转小写、词干、提取主题等

In [1]:
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.stem.porter import PorterStemmer
import emoji
import string

def preprocess_text(text, remove_stop = True, stem_words = False, remove_mentions_hashtags = True):
    """
    eg:
    input: preprocess_text("@water #dream hi hello where are you going be there tomorrow happening happen happens",  
    stem_words = True) 
    output: ['tomorrow', 'happen', 'go', 'hello']
    """
    # Remove emojis
    emoji_pattern = re.compile("[" "\U0001F1E0-\U0001F6FF" "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r"", text)
    text = "".join([x for x in text if x not in emoji.UNICODE_EMOJI])

    if remove_mentions_hashtags:
        text = re.sub(r"@(\w+)", " ", text)
        text = re.sub(r"#(\w+)", " ", text)
        
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]') # remove punctuation and numbers
    nopunct = regex.sub(" ", text.lower()) #lower-case
    words = (''.join(nopunct)).split()
    
    if(remove_stop):
        words = [w for w in words if w not in ENGLISH_STOP_WORDS]
        words = [w for w in words if len(w) > 2]  # remove a,an,of etc.
        
    if(stem_words):
        stemmer = PorterStemmer()
        words = [stemmer.stem(w) for w in words]
        
    return list(words)

## 词表示(word representation)

### 传统文本表示方法（离散式）
**One-hot**  
将每一个*单词*使用一个离散的向量表示
首先对所有句子的字进行索引，即将每个字确定一个编号：
```python
句子1：我 爱 北 京 天 安 门
句子2：我 喜 欢 上 海
{'我': 1, '爱': 2, '北': 3, '京': 4, '天': 5,
'安': 6, '门': 7, '喜': 8, '欢': 9, '上': 10, '海': 11}
```
在这里共包括11个字，因此每个字可以转换为一个11维度稀疏向量：
```
我：[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
爱：[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
...
海：[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
```

**Bag of Words**  
Bag of Words（词袋表示），也称为`Count Vectors`，每个文档的字/词可以使用其*出现次数*来进行表示
直接统计每个字出现的次数，并进行赋值：
```python
句子1：我 爱 北 京 天 安 门
转换为 [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]

句子2：我 喜 欢 上 海
转换为 [1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1]
```
在sklearn中用`CountVectorizer`实现这一步骤

**N-gram**  
N-gram与Count Vectors类似，不过加入了相邻单词组合成为新的单词，并进行计数。
如果N取值为2，则句子1和句子2就变为：
```
句子1：我爱 爱北 北京 京天 天安 安门
句子2：我喜 喜欢 欢上 上海
```

 **TF-IDF**  
 TF-IDF 分数由两部分组成：第一部分是*词语频率*（Term Frequency），第二部分是*逆文档频率*（Inverse Document Frequency）
 
 $TF=\frac{该词语在当前文档出现的次数}{当前文档中词语的总数}\\
 IDF=\log{(\frac{文档总数}{出现该词语的文档总数})}\\
 TF-IDF=TF*IDF$

 在sklearn中用`TfidfVectorizer`实现这一步骤


In [3]:
corpus = 'Today is a good day,tomorrow is going to be a good day'
vocab = preprocess_text(corpus)
vocab
# ‘Tomorrow will be a good day’的一天的文本可以编码为：[0,1,1,0,1,1,0,1,0]

['today', 'good', 'day', 'tomorrow', 'going', 'good', 'day']

In [3]:
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# 假设我们有以下的中文文档集合
documents = [
    "今天天气不错，我们去郊游吧。",
    "今天天气不好，我们去看电影吧。",
    "今天天气真不错，我们去公园散步吧。",
    "今天天气真不错，我们去海边玩吧。"
]

# 使用jieba进行中文分词
def chinese_tokenizer(text):
    return jieba.cut(text)

# 初始化CountVectorizer，使用jieba分词
vectorizer = CountVectorizer(tokenizer=chinese_tokenizer)

# 计算词频（Term Frequency）
X_counts = vectorizer.fit_transform(documents)

# 初始化TfidfVectorizer，使用jieba分词
tfidf_vectorizer = TfidfVectorizer(tokenizer=chinese_tokenizer)

# 计算TF-IDF权重
X_tfidf = tfidf_vectorizer.fit_transform(documents)

# 打印词频矩阵
print("词频矩阵：\n", X_counts.toarray())

# 打印TF-IDF矩阵
print("TF-IDF矩阵：\n", X_tfidf.toarray())

词频矩阵：
 [[1 0 1 1 0 1 1 1 0 0 0 0 0 0 1 1]
 [1 1 0 1 0 1 0 1 0 0 0 1 1 0 0 1]
 [1 0 0 1 1 1 1 1 1 0 0 0 0 1 0 1]
 [1 0 0 1 0 1 0 1 0 1 1 0 0 1 0 1]]
TF-IDF矩阵：
 [[0.26147089 0.         0.50105424 0.26147089 0.         0.26147089
  0.39503692 0.26147089 0.         0.         0.         0.
  0.         0.         0.50105424 0.26147089]
 [0.24987111 0.47882569 0.         0.24987111 0.         0.24987111
  0.         0.24987111 0.         0.         0.         0.47882569
  0.47882569 0.         0.         0.24987111]
 [0.24318358 0.         0.         0.24318358 0.46601044 0.24318358
  0.36740799 0.24318358 0.46601044 0.         0.         0.
  0.         0.36740799 0.         0.24318358]
 [0.26147089 0.         0.         0.26147089 0.         0.26147089
  0.         0.26147089 0.         0.50105424 0.50105424 0.
  0.         0.39503692 0.         0.26147089]]


### 词嵌入(Word Embedding)
**FastText**
- 通过Embedding层将单词映射到稠密空间，然后将句子中所有的单词在Embedding空间中进行平均
- FastText用单词的Embedding叠加获得的文档向量，将相似的句子分为一类
- FastText学习到的Embedding空间维度比较低，可以快速进行训练

**Word2vec**
- 使用浅层两层神经网络执行特定任务
- CBOW（连续词袋）：根据源上下文单词（环绕单词）预测当前的目标单词（中心单词）
- Skip—Gram：在给定目标单词（中心单词）的情况下预测源上下文单词（环绕单词）
  
**Glove** 
- 与Word2vec不同，Glove利用了单词的全局共现，而不仅仅是局部上下文
- 有许多在大规模语料库经过预训练的，具有不同向量长度的预训练词向量，例如Glove，fasttext等

**NNLM**
- 利用n-1个单词来预测第n个单词

In [None]:
### word2vec
# 使用词向量相似度查找与给定文档最相似的文档
import numpy as np

#loading the glove file into a dictionary of words
def load_glove(filename):
    glove_dict = {}
    with open(filename) as f:
        file_content = f.readlines()
    for line in file_content:
        line_content = line.split()
        glove_dict[line_content[0]] = np.array(line_content[1:], dtype=float)
    return glove_dict

#get centroid of a particular document
def get_centroid(text, gloves):
    words_list = preprocess_text(text)
    word_vec_sum = 0
    words_count = 0
    for w in words_list:
        if w in gloves:
            word_vec_sum += gloves[w]
            words_count += 1
    if words_count:
        return word_vec_sum/words_count
    else:
        return 0

#get distance between two centroids
def get_distance (a,b):
    return (np.linalg.norm(a - b))

In [24]:
#  训练从头开始生成单词向量
import gensim
words_list = []
with open('../data/potato.txt') as f:
    for text in f:
        words_list.append(preprocess_text(text))
model = gensim.models.Word2Vec(
        words_list,
        vector_size=150,
        window=2,
        min_count=1,
        workers=10,
        epochs=10)
print('vocabulary: ', model.wv.index_to_key,'\n')
print(model.wv.most_similar('starchy'),'\n')
print(model.wv.get_vector('potato'),'\n')
len(model.wv.index_to_key)


vocabulary:  ['potato', 'century', 'potatoes', 'plant', 'irish', 'crop', 'major', 'crops', 'end', 'tubers', 'europe', 'baking', 'flour', 'used', 'thickener', 'vegetable', 'sauces', 'highly', 'digestible', 'supply', 'vitamin', 'protein', 'thiamin', 'niacin', 'thought', 'ground', 'served', 'cooked', 'edible', 'solanum', 'tuberosum', 'annual', 'nightshade', 'family', 'solanaceae', 'grown', 'starchy', 'native', 'mashed', 'peruvian', 'bolivian', 'andes', 'world', 'main', 'food', 'frequently', 'domesticated', 'independently', 'dependence', 'times', 'mid', 'western', 'eastern', 'hemispheres', 'decades', 'economy', 'dependent', 'disastrous', 'failures', 'especially', 'largely', 'late', 'blight', 'phytophthora', 'infestans', 'resulting', 'famine', 'generated', 'cautious', 'spread', 'continued', 'england', 'west', 'cultivated', 'attitude', 'america', 'incas', 'early', 'years', 'ago', 'encountered', 'invading', 'spaniards', 'introduced', 'second', 'half', 'ireland', 'continental', 'particularly',

91

In [None]:
# Word2Vec模型的保存和加载
from gensim.models import Word2Vec
import jieba

# 示例文本数据
sentences = ["月之暗面科技有限公司是一家位于中国的人工智能公司。", "人工智能正在改变世界。"]

# 分词
sentences = [list(jieba.cut(sentence)) for sentence in sentences]

# 训练Word2Vec模型
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# 检查词汇表并获取向量
word = '人工智能'
if word in model.wv:
    vector = model.wv[word]
    print("向量:", vector)
else:
    print(f"词 '{word}' 不在词汇表中。")



In [3]:
### NNLM示例代码：https://www.cnblogs.com/jyroy/p/14726894.html
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data
from torch.autograd import Variable
dtype = torch.FloatTensor

sentences = ["i like dog", "i love coffee", "i hate milk"]  # 句子数据集
n_steps = 2  # 用前几个单词来预测下一个单词，e.g. 2个
n_hidden = 2  # 隐藏层的节点个数，e.g. 2个
m = 2  # 词向量的长度

word_list = " ".join(sentences).split(" ")  # 获取所有的单词
print("未去重词表：", word_list)
word_list = list(set(word_list))  # 去重
print("去重词表：", word_list)
word_dict = {w: i for i, w in enumerate(word_list)}  # 单词->索引
print("单词索引：", word_dict)
number_dict = {i: w for i, w in enumerate(word_list)}  # 索引->单词
print("索引单词：", number_dict)
num_words = len(word_dict)  # 单词总数
print("单词总数：", num_words)

未去重词表： ['i', 'like', 'dog', 'i', 'love', 'coffee', 'i', 'hate', 'milk']
去重词表： ['milk', 'coffee', 'hate', 'dog', 'love', 'i', 'like']
单词索引： {'milk': 0, 'coffee': 1, 'hate': 2, 'dog': 3, 'love': 4, 'i': 5, 'like': 6}
索引单词： {0: 'milk', 1: 'coffee', 2: 'hate', 3: 'dog', 4: 'love', 5: 'i', 6: 'like'}
单词总数： 7


In [4]:
# 模型结构
class NNLM(nn.Module):
  # NNLM model architecture
  def __init__(self):
    super(NNLM, self).__init__()
    self.C = nn.Embedding(num_embeddings = num_words, embedding_dim = m)  # 词表
    self.d = nn.Parameter(torch.randn(n_hidden).type(dtype))  # 隐藏层的偏置
    self.H = nn.Parameter(torch.randn(n_steps * m, n_hidden).type(dtype))  # 输入层到隐藏层的权重
    self.U = nn.Parameter(torch.randn(n_hidden, num_words).type(dtype))  # 隐藏层到输出层的权重
    self.b = nn.Parameter(torch.randn(num_words).type(dtype))  # 输出层的偏置
    self.W = nn.Parameter(torch.randn(n_steps * m, num_words).type(dtype))  # 输入层到输出层的权重

  def forward(self, input):
    '''
    input: [batchsize, n_steps] 
    x: [batchsize, n_steps*m]
    hidden_layer: [batchsize, n_hidden]
    output: [batchsize, num_words]
    '''
    x = self.C(input)  # 获得一个batch的词向量的词表
    x = x.view(-1, n_steps * m)
    hidden_out = torch.tanh(torch.mm(x, self.H) + self.d)  # 获取隐藏层输出
    output = torch.mm(x, self.W) + torch.mm(hidden_out, self.U) + self.b  # 获得输出层输出
    return output
  
# 格式化输入数据
def make_batch(sentences):
  '''
  input_batch：一组batch中前n_steps个单词的索引
  target_batch：一组batch中每句话待预测单词的索引
  '''
  input_batch = []
  target_batch = []
  for sentence in sentences:
    word = sentence.split()
    input = [word_dict[w] for w in word[:-1]]
    target = word_dict[word[-1]]
    input_batch.append(input)
    target_batch.append(target)
  return input_batch, target_batch

input_batch, target_batch = make_batch(sentences)
input_batch = torch.LongTensor(input_batch)
target_batch = torch.LongTensor(target_batch)
print("input_batch:", input_batch)
print("target_batch:", target_batch)


input_batch: tensor([[5, 6],
        [5, 4],
        [5, 2]])
target_batch: tensor([3, 1, 0])


In [5]:
# 训练模型
model = NNLM()

criterion = nn.CrossEntropyLoss()  # 使用cross entropy作为loss function
optimizer = optim.Adam(model.parameters(), lr = 0.001)  # 使用Adam作为optimizer

for epoch in range(2000):
  # 梯度清零
  optimizer.zero_grad()
  # 计算predication
  output = model(input_batch)
  # 计算loss
  loss = criterion(output, target_batch)
  if (epoch + 1) % 100 == 0:
    print("Epoch:{}".format(epoch+1), "Loss:{:.3f}".format(loss))
  # 反向传播
  loss.backward()
  # 更新权重参数
  optimizer.step()

Epoch:100 Loss:3.231
Epoch:200 Loss:2.375
Epoch:300 Loss:1.739
Epoch:400 Loss:1.158
Epoch:500 Loss:0.680
Epoch:600 Loss:0.404
Epoch:700 Loss:0.255
Epoch:800 Loss:0.173
Epoch:900 Loss:0.125
Epoch:1000 Loss:0.094
Epoch:1100 Loss:0.074
Epoch:1200 Loss:0.059
Epoch:1300 Loss:0.049
Epoch:1400 Loss:0.041
Epoch:1500 Loss:0.034
Epoch:1600 Loss:0.029
Epoch:1700 Loss:0.026
Epoch:1800 Loss:0.022
Epoch:1900 Loss:0.020
Epoch:2000 Loss:0.017


In [6]:
# 推理
pred = model(input_batch).data.max(1, keepdim=True)[1]  # 找出概率最大的下标
print("Predict:", pred)
print([sentence.split()[:2] for sentence in sentences], "---->", [number_dict[n.item()] for n in pred.squeeze()])

Predict: tensor([[3],
        [1],
        [0]])
[['i', 'like'], ['i', 'love'], ['i', 'hate']] ----> ['dog', 'coffee', 'milk']
