### 命名实体识别

In [2]:
import spacy

nlp = spacy.load('zh_core_web_sm')
text = "下周三晚上给第二个Keynote定稿"
doc = nlp(text)
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)



下周三晚上 0 5 TIME
第二 6 8 ORDINAL


### 命名实体识别 - 模式匹配

In [3]:
import re

pattern = r'^\d{12}[A-Z]{3}$' # 这个正则表达式匹配12个数字后跟3个大写字母

text = "你好，我有一个订单一直没有收到，订单号是202303251200ABC"
doc = nlp(text)
for word in doc:
    match = re.search(pattern, word.text)
    if match:
        print("订单号: ", match.group())

订单号:  202303251200ABC


### 命名实体识别 - 直接通过ChatGPT的API来做

In [12]:
import openai, os
openai.api_key = os.environ.get("OPENAI_API_KEY")

COMPLETION_MODEL = "text-davinci-003"
def get_response(prompt, temperature = 1.0, stop=None):
    completions = openai.Completion.create(
        engine=COMPLETION_MODEL,
        prompt=prompt,
        max_tokens=1024,
        n=1,
        stop=stop,
        temperature=temperature,
    )
    message = completions['choices'][0]['text']
    return message

text = "你好，我有一个订单一直没有收到，订单号是202303251200ABC"

print(get_response("请从下面的文本中提取出用户的订单号，并以json形式显示：\n\n" + text))

。

"orderNumber": "202303251200ABC"


### 分词找一些有用的关键词

In [4]:
text = '''外形外观：紫色，非常大气，方方正正，四周圆角
屏幕音效：清晰度分辨率都挺满意，听音乐效果也不错
拍照效果：非常好
运行速度：很流畅
待机时间：之前的安卓手机用了近三年，一天3充，现在妥妥的一天一充'''

useful_words_list = ['大气', '满意', '好', '流畅']

doc = nlp(text)
for word in doc:
    if word.text in useful_words_list:
        print(word)

大气
满意
好
流畅


### 计算 N-Grams

In [8]:
text = '''外形外观：紫色，非常大气，方方正正，四周圆角
屏幕音效：清晰度分辨率都挺满意，听音乐效果也不错
拍照效果：非常好
运行速度：很流畅
待机时间：之前的安卓手机用了近三年，一天3充，现在妥妥的一天一充'''

from nltk.util import ngrams

doc = nlp(text)

words = []
for word in doc:
    words.append(word.text)

list(ngrams(words, 2))


[('外形', '外观'),
 ('外观', '：'),
 ('：', '紫色'),
 ('紫色', '，'),
 ('，', '非常'),
 ('非常', '大气'),
 ('大气', '，'),
 ('，', '方方正正'),
 ('方方正正', '，'),
 ('，', '四周'),
 ('四周', '圆角'),
 ('圆角', '\n'),
 ('\n', '屏幕'),
 ('屏幕', '音效'),
 ('音效', '：'),
 ('：', '清晰度'),
 ('清晰度', '分辨率'),
 ('分辨率', '都'),
 ('都', '挺'),
 ('挺', '满意'),
 ('满意', '，'),
 ('，', '听'),
 ('听', '音乐'),
 ('音乐', '效果'),
 ('效果', '也'),
 ('也', '不错'),
 ('不错', '\n'),
 ('\n', '拍照'),
 ('拍照', '效果'),
 ('效果', '：'),
 ('：', '非常'),
 ('非常', '好'),
 ('好', '\n'),
 ('\n', '运行'),
 ('运行', '速度'),
 ('速度', '：'),
 ('：', '很'),
 ('很', '流畅'),
 ('流畅', '\n'),
 ('\n', '待机'),
 ('待机', '时间'),
 ('时间', '：'),
 ('：', '之前'),
 ('之前', '的'),
 ('的', '安卓'),
 ('安卓', '手机'),
 ('手机', '用'),
 ('用', '了'),
 ('了', '近'),
 ('近', '三年'),
 ('三年', '，'),
 ('，', '一'),
 ('一', '天'),
 ('天', '3'),
 ('3', '充'),
 ('充', '，'),
 ('，', '现在'),
 ('现在', '妥妥'),
 ('妥妥', '的'),
 ('的', '一'),
 ('一', '天一'),
 ('天一', '充')]

### TF-IDF 演示

In [19]:
doc_a = 'this document is first document'
doc_b = 'this document is the second document'

bag_of_words_a = doc_a.split(' ')
bag_of_words_b = doc_b.split(' ')

unique_words_set = set(bag_of_words_a).union(set(bag_of_words_b))
print(unique_words_set)

# Now create a dictionary of words and their occurence for each document in the corpus (collection of documents).

dict_a = dict.fromkeys(unique_words_set, 0)
# print(dict_a) # {'this': 0, 'document': 0, 'second': 0, 'is': 0, 'the': 0}

for word in bag_of_words_a:
    dict_a[word] += 1

print(dict_a)
# {'this': 1, 'document': 2, 'second': 1, 'is': 1, 'the': 1}

# similarly

dict_b = dict.fromkeys(unique_words_set, 0)

for word in bag_of_words_b:
    dict_b[word] += 1

print(dict_b)

{'the', 'is', 'first', 'this', 'second', 'document'}
{'the': 0, 'is': 1, 'first': 1, 'this': 1, 'second': 0, 'document': 2}
{'the': 1, 'is': 1, 'first': 0, 'this': 1, 'second': 1, 'document': 2}


In [20]:
def compute_term_frequency(word_dictionary, bag_of_words):
    term_frequency_dictionary = {}
    length_of_bag_of_words = len(bag_of_words)

    for word, count in word_dictionary.items():
        term_frequency_dictionary[word] = count / float(length_of_bag_of_words)

    return term_frequency_dictionary

# Implementation

print(compute_term_frequency(dict_a, bag_of_words_a))

{'the': 0.0, 'is': 0.2, 'first': 0.2, 'this': 0.2, 'second': 0.0, 'document': 0.4}


In [28]:
import math

def compute_inverse_document_frequency(full_doc_list):
    idf_dict = {}
    length_of_doc_list = len(full_doc_list)

    # Initialize dictionary with words from all documents
    idf_dict = dict.fromkeys(full_doc_list[0].keys(), 0)
    
    # Count the number of documents that contain each word
    for doc in full_doc_list:
        for word, val in doc.items():
            if val > 0:
                idf_dict[word] += 1

    # Compute IDF for each word
    for word, val in idf_dict.items():
        idf_dict[word] = math.log((length_of_doc_list+1) / (float(val) + 1))

    return idf_dict

final_idf_dict = compute_inverse_document_frequency([dict_a, dict_b])

import json

# Print the final_idf_dict dictionary in a pretty format
print(json.dumps(final_idf_dict, indent=2))

{
  "the": 0.4054651081081644,
  "is": 0.0,
  "first": 0.4054651081081644,
  "this": 0.0,
  "second": 0.4054651081081644,
  "document": 0.0
}


### 直接使用TF/IDF的库

In [13]:
import pandas as pd
import numpy as np

datafile_path = "data/fine_food_reviews_with_embeddings_1k.csv"

df = pd.read_csv(datafile_path)
df['embedding'] = df.embedding.apply(eval).apply(np.array)

In [14]:
df.head

<bound method NDFrame.head of      Unnamed: 0   ProductId          UserId  Score   
0             0  B003XPF9BO  A3R7JR3FMEBXQB      5  \
1           297  B003VXHGPK  A21VWSCGW7UUAR      4   
2           296  B008JKTTUA  A34XBAIFT02B60      1   
3           295  B000LKTTTW  A14MQ40CCU8B13      5   
4           294  B001D09KAM  A34XBAIFT02B60      1   
..          ...         ...             ...    ...   
995         623  B0000CFXYA  A3GS4GWPIBV0NT      1   
996         624  B0001BH5YM   A1BZ3HMAKK0NC      5   
997         625  B0009ET7TC  A2FSDQY5AI6TNX      5   
998         619  B007PA32L2  A15FF2P7RPKH6G      5   
999         999  B001EQ5GEO  A3VYU0VO6DYV6I      5   

                                               Summary   
0    where does one  start...and stop... with a tre...  \
1                     Good, but not Wolfgang Puck good   
2    Should advertise coconut as an ingredient more...   
3                                     Best tomato soup   
4    Should advertise coconut a

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = df['Summary']
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
print(tfidf_df)

      12   13   21   31   32   34   40   50   51   60  ...  yogurt  yorkie   
0    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...     0.0     0.0  \
1    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...     0.0     0.0   
2    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...     0.0     0.0   
3    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...     0.0     0.0   
4    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...     0.0     0.0   
..   ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...     ...     ...   
995  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...     0.0     0.0   
996  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...     0.0     0.0   
997  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...     0.0     0.0   
998  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...     0.0     0.0   
999  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...     0.0     0.0   

     you  your  yuk  yum  yummmm  yummmmm  yummy  zipfizz  
0  

In [18]:
def get_top_n_words(row, tfidf_df, top_n=5):
    # Get the words and corresponding tf-idf scores for the given row
    words_and_scores = [(word, tfidf_df.loc[row, word]) for word in tfidf_df.columns]
    
    # Sort the words by their tf-idf scores
    words_and_scores.sort(key=lambda x: x[1], reverse=True)

    # Get the top n words
    top_n_words_and_scores = words_and_scores[:top_n]
    
    return top_n_words_and_scores

# Print the highest tf-idf words and their scores for the first 5 rows
for i in range(5):
    print(f"Row {i+1}: {get_top_n_words(i, tfidf_df)}")

Row 1: [('start', 0.394384915946285), ('stop', 0.394384915946285), ('where', 0.394384915946285), ('does', 0.35649943107170623), ('treat', 0.312176255125172)]
Row 2: [('good', 0.5681028016321258), ('puck', 0.49441945926150405), ('wolfgang', 0.49441945926150405), ('but', 0.3153408915839207), ('not', 0.2981914242226147)]
Row 3: [('advertise', 0.36559695823357646), ('ingredient', 0.36559695823357646), ('prominently', 0.36559695823357646), ('should', 0.36559695823357646), ('an', 0.3574098136743376)]
Row 4: [('tomato', 0.6610768832907321), ('soup', 0.6331508937690398), ('best', 0.4026130898245583), ('12', 0.0), ('13', 0.0)]
Row 5: [('advertise', 0.36559695823357646), ('ingredient', 0.36559695823357646), ('prominently', 0.36559695823357646), ('should', 0.36559695823357646), ('an', 0.3574098136743376)]


### Word2Vec的演示