In [1]:
TARGET_COMPANY_ID = 174438
WALMART_COMPANY_ID = 313055
KOHL_COMPANY_ID = 61243

### 抓資料並整合時間

In [2]:
import pandas as pd
from sas7bdat import SAS7BDAT

def find_transcript_and_keydev_ids(company_id: int):
    transcript_ids = []
    keyDev_ids = []
    with SAS7BDAT("/Users/joeyyyy/Documents/conference call/identifiers/CompanyID_TranscriptID_KeyDevID.sas7bdat", skip_header=True) as reader:
        for row in reader:
            if row[2] and company_id == row[0]:
                    transcript_ids.append(int(row[1])) 
                    keyDev_ids.append(int(row[3]))

    print(f"length of transcript ids: {len(transcript_ids)}")
    print(f"length of keyDev ids: {len(keyDev_ids)}")
    return transcript_ids, keyDev_ids

In [3]:
def find_transcript_creation_date(transcript_ids: list):
    ciqtranscript_df = pd.read_csv("/Users/joeyyyy/Documents/conference call/transcript_table/0_ciqtranscript.csv")
    filtered_df = ciqtranscript_df[ciqtranscript_df["transcriptid"].isin(transcript_ids)]
    return filtered_df

In [4]:
def find_transcripts(transcript_ids: list, from_index: int=1, to_index: int=100):
    result_df = pd.DataFrame()
    for i in range(from_index, to_index):
        df = pd.read_csv(f"{i}00000ciqtranscriptcomponent.csv")
        # 過濾符合 transcript id 的行
        filtered_df = df[df["transcriptid"].isin(transcript_ids)]
        
        # 將篩選出的結果附加到 result_df
        result_df = pd.concat([result_df, filtered_df], ignore_index=True)

    return result_df

In [5]:
ciqtranscript_df = pd.read_csv("/Users/joeyyyy/Documents/conference call/transcript_table/0_ciqtranscript.csv")

In [6]:
target_transcript_ids, target_keyDev_ids = find_transcript_and_keydev_ids(TARGET_COMPANY_ID)
target_filtered_df = find_transcript_creation_date(target_transcript_ids)
target_df = find_transcripts(target_transcript_ids, from_index=1, to_index=100)
target_df

length of transcript ids: 94
length of keyDev ids: 94


Unnamed: 0.1,Unnamed: 0,transcriptcomponentid,transcriptid,componentorder,transcriptcomponenttypeid,transcriptpersonid,componenttext
0,83961,316017,8652,8,4,24571.0,A couple comments to put some color around tha...
1,83962,316013,8652,4,2,24635.0,Thanks Doug. As Bob and Doug have just describ...
2,83963,316021,8652,12,4,24635.0,"Well, we’re not expecting any margin expansion..."
3,83964,316025,8652,16,3,6365.0,Okay. Great. Thank you.\r\n
4,83965,316029,8652,20,3,2636.0,"Just on the aging, I mean what’s the compositi..."
...,...,...,...,...,...,...,...
1169,46221,13101540,262640,17,3,204598.0,My question is on the PFresh remodels. And I'm...
1170,46222,13101544,262640,21,3,204598.0,"Okay. And then just if I missed this, how many..."
1171,46223,13101548,262640,25,4,113424.0,"Well, at a big picture level, for we've really..."
1172,46224,13101552,262640,29,7,1.0,Your next question comes from the line of Robe...


In [7]:
target_df = target_df.merge(ciqtranscript_df[["transcriptid", "transcriptcreationdateutc"]], on="transcriptid", how="left")
target_df

Unnamed: 0.1,Unnamed: 0,transcriptcomponentid,transcriptid,componentorder,transcriptcomponenttypeid,transcriptpersonid,componenttext,transcriptcreationdateutc
0,83961,316017,8652,8,4,24571.0,A couple comments to put some color around tha...,2008-06-20 01:22:08
1,83962,316013,8652,4,2,24635.0,Thanks Doug. As Bob and Doug have just describ...,2008-06-20 01:22:08
2,83963,316021,8652,12,4,24635.0,"Well, we’re not expecting any margin expansion...",2008-06-20 01:22:08
3,83964,316025,8652,16,3,6365.0,Okay. Great. Thank you.\r\n,2008-06-20 01:22:08
4,83965,316029,8652,20,3,2636.0,"Just on the aging, I mean what’s the compositi...",2008-06-20 01:22:08
...,...,...,...,...,...,...,...,...
1169,46221,13101540,262640,17,3,204598.0,My question is on the PFresh remodels. And I'm...,2012-02-23 16:38:20
1170,46222,13101544,262640,21,3,204598.0,"Okay. And then just if I missed this, how many...",2012-02-23 16:38:20
1171,46223,13101548,262640,25,4,113424.0,"Well, at a big picture level, for we've really...",2012-02-23 16:38:20
1172,46224,13101552,262640,29,7,1.0,Your next question comes from the line of Robe...,2012-02-23 16:38:20


In [8]:
len(target_df["transcriptid"].unique())

30

In [9]:
walmart_transcript_ids, target_keyDev_ids = find_transcript_and_keydev_ids(WALMART_COMPANY_ID)
walmart_filtered_df = find_transcript_creation_date(walmart_transcript_ids)
walmart_df = find_transcripts(walmart_transcript_ids, from_index=1, to_index=100)
walmart_df = walmart_df.merge(ciqtranscript_df[["transcriptid", "transcriptcreationdateutc"]], on="transcriptid", how="left")
walmart_df

length of transcript ids: 141
length of keyDev ids: 141


Unnamed: 0.1,Unnamed: 0,transcriptcomponentid,transcriptid,componentorder,transcriptcomponenttypeid,transcriptpersonid,componenttext,transcriptcreationdateutc
0,44206,79112,9572,1,5,,This call is the property of Wal-Mart Stores I...,2008-06-19 23:01:30
1,44207,79113,9574,1,5,,"Thank you for calling Wal-Mart Stores, Inc. Fi...",2008-06-19 23:01:30
2,85161,790274,11432,1,5,,"Thank you for calling Wal-Mart Stores, Inc. se...",2008-08-22 21:59:16
3,21618,1047869,14551,1,5,,"This call is the property of Wal-Mart Stores, ...",2008-11-13 18:25:22
4,84837,1647127,22214,1,5,,"Welcome to the Wal-Mart Stores, Inc. Earnings ...",2009-05-14 19:18:38
...,...,...,...,...,...,...,...,...
749,73732,13011009,260660,0,1,1.0,Welcome to the Walmart Earnings Call Fourth Qu...,2012-02-21 14:13:06
750,73733,13011013,260660,6,2,229062.0,"Thanks, Doug. It is an honor and a privilege t...",2012-02-21 14:13:06
751,73734,13011010,260660,1,2,113006.0,"Hello, this is Carol Schumacher, Vice Presiden...",2012-02-21 14:13:06
752,73735,13011014,260660,7,2,113007.0,"Thanks, Ross. We are pleased with our full yea...",2012-02-21 14:13:06


In [10]:
len(walmart_df["transcriptid"].unique())

23

In [11]:
kohl_transcript_ids, target_keyDev_ids = find_transcript_and_keydev_ids(KOHL_COMPANY_ID)
kohl_filtered_df = find_transcript_creation_date(kohl_transcript_ids)
kohl_df = find_transcripts(kohl_transcript_ids, from_index=1, to_index=100)
kohl_df

length of transcript ids: 84
length of keyDev ids: 84


Unnamed: 0.1,Unnamed: 0,transcriptcomponentid,transcriptid,componentorder,transcriptcomponenttypeid,transcriptpersonid,componenttext
0,87306,830546,11460,45,4,47497.0,I wouldn’t characterize anything as a big surp...
1,87307,830550,11460,49,3,6532.0,Is there a risk that you’ve got some benefit f...
2,87308,830554,11460,53,4,47505.0,We’re in our third phase of innovation in our ...
3,87309,830558,11460,57,7,1.0,Your next question comes from the line of Dana...
4,87310,830562,11460,61,4,47497.0,We think about the whole thing as kind of inve...
...,...,...,...,...,...,...,...
1509,43549,13096439,262508,10,7,1.0,Your next question comes from the line of Adri...
1510,43550,13096443,262508,14,4,113022.0,She's going to see in all of the elements of o...
1511,43551,13096447,262508,18,7,1.0,Your next question comes from the line of Lorr...
1512,43552,13096451,262508,22,3,101671.0,"In terms of 2011, one of the positive standout..."


In [12]:
kohl_df = kohl_df.merge(ciqtranscript_df[["transcriptid", "transcriptcreationdateutc"]], on="transcriptid", how="left")
kohl_df

Unnamed: 0.1,Unnamed: 0,transcriptcomponentid,transcriptid,componentorder,transcriptcomponenttypeid,transcriptpersonid,componenttext,transcriptcreationdateutc
0,87306,830546,11460,45,4,47497.0,I wouldn’t characterize anything as a big surp...,2008-08-22 22:20:55
1,87307,830550,11460,49,3,6532.0,Is there a risk that you’ve got some benefit f...,2008-08-22 22:20:55
2,87308,830554,11460,53,4,47505.0,We’re in our third phase of innovation in our ...,2008-08-22 22:20:55
3,87309,830558,11460,57,7,1.0,Your next question comes from the line of Dana...,2008-08-22 22:20:55
4,87310,830562,11460,61,4,47497.0,We think about the whole thing as kind of inve...,2008-08-22 22:20:55
...,...,...,...,...,...,...,...,...
1509,43549,13096439,262508,10,7,1.0,Your next question comes from the line of Adri...,2012-02-23 14:48:55
1510,43550,13096443,262508,14,4,113022.0,She's going to see in all of the elements of o...,2012-02-23 14:48:55
1511,43551,13096447,262508,18,7,1.0,Your next question comes from the line of Lorr...,2012-02-23 14:48:55
1512,43552,13096451,262508,22,3,101671.0,"In terms of 2011, one of the positive standout...",2012-02-23 14:48:55


In [13]:
kohl_df["transcriptcreationdateutc"] = pd.to_datetime(kohl_df["transcriptcreationdateutc"])
kohl_2009_df = kohl_df[kohl_df['transcriptcreationdateutc'].dt.year == 2009]
kohl_2009_df.to_csv("kohl_2009.csv")
kohl_2009_df

Unnamed: 0.1,Unnamed: 0,transcriptcomponentid,transcriptid,componentorder,transcriptcomponenttypeid,transcriptpersonid,componenttext,transcriptcreationdateutc
187,39749,1262186,17467,62,3,85361.0,Is Simply Vera along those lines as well?\r\n,2009-02-27 05:48:51
188,39750,1262190,17467,66,7,1.0,Our next question comes from Daniel Binder - J...,2009-02-27 05:48:51
189,39751,1262194,17467,70,4,85359.0,Cash is good. More cash is better than not as...,2009-02-27 05:48:51
190,39752,1262178,17467,54,4,47497.0,"I think the short answer is yes, it’s definite...",2009-02-27 05:48:51
191,39753,1262182,17467,58,4,47497.0,It’s not really just about the extra day it’s ...,2009-02-27 05:48:51
...,...,...,...,...,...,...,...,...
546,72853,2955666,41727,107,3,97568.0,October's business was probably limited somewh...,2009-11-12 21:42:14
547,72854,2955670,41727,111,3,97568.0,"I know you're always working on new stuff, do ...",2009-11-12 21:42:14
548,72855,2955674,41727,115,4,113023.0,"No, that's it. Thanks, everybody. Have a good ...",2009-11-12 21:42:14
549,72856,2955574,41727,15,3,2725.0,Is there any negative impact from the lack of ...,2009-11-12 21:42:14


In [14]:
target_df["transcriptcreationdateutc"] = pd.to_datetime(target_df["transcriptcreationdateutc"])
target_2009_df = target_df[target_df['transcriptcreationdateutc'].dt.year == 2009]
target_2009_df.to_csv("target_2009.csv")
target_2009_df

Unnamed: 0.1,Unnamed: 0,transcriptcomponentid,transcriptid,componentorder,transcriptcomponenttypeid,transcriptpersonid,componenttext,transcriptcreationdateutc
69,18632,3316262,17205,50,7,1.0,Your next question comes from Daniel Binder - ...,2009-12-10 21:18:42
70,18633,3316238,17205,26,4,59516.0,Again based on how precipitously our November ...,2009-12-10 21:18:42
71,18634,3316230,17205,18,4,59484.0,The way I would think of it keying off of Greg...,2009-12-10 21:18:42
72,18635,3316234,17205,22,3,10920.0,And that’s your expectation to stick with over...,2009-12-10 21:18:42
73,18636,3316222,17205,10,4,59516.0,I think you can still plan on an increase in t...,2009-12-10 21:18:42
...,...,...,...,...,...,...,...,...
330,94316,2989901,42219,54,3,113227.0,"And my second question, Doug, can you talk a l...",2009-11-17 22:01:00
331,94317,2989905,42219,58,3,113227.0,"And my final question, Kathy, if you could jus...",2009-11-17 22:01:00
332,94318,2989909,42219,62,4,113424.0,2010 store opening program is essentially alre...,2009-11-17 22:01:00
333,94319,2989913,42219,66,3,3259.0,On the expectations for the SG&A mid-single-di...,2009-11-17 22:01:00


In [15]:
walmart_df["transcriptcreationdateutc"] = pd.to_datetime(walmart_df["transcriptcreationdateutc"])
walmart_2009_df = walmart_df[walmart_df['transcriptcreationdateutc'].dt.year == 2009]
walmart_2009_df.to_csv("walmart_2009.csv")
walmart_2009_df

Unnamed: 0.1,Unnamed: 0,transcriptcomponentid,transcriptid,componentorder,transcriptcomponenttypeid,transcriptpersonid,componenttext,transcriptcreationdateutc
4,84837,1647127,22214,1,5,,"Welcome to the Wal-Mart Stores, Inc. Earnings ...",2009-05-14 19:18:38
5,43019,2154747,29741,3,2,113010.0,"Thank you very much, Carol. Welcome, everyone,...",2009-08-13 19:30:50
6,43020,2154751,29741,7,2,113005.0,"Thank you, Doug. I’m pleased with the underlyi...",2009-08-13 19:30:50
7,43021,2154752,29741,8,2,113011.0,"Thanks, Brian. Let’s start with the review of...",2009-08-13 19:30:50
8,43022,2154748,29741,4,2,113007.0,"Thanks, Mike. Total net sales for the company ...",2009-08-13 19:30:50
9,43023,2154745,29741,1,1,1.0,"Welcome to the Wal-Mart Stores, Inc. Earnings ...",2009-08-13 19:30:50
10,43024,2154749,29741,5,2,113009.0,"Thank you, Charles. We are pleased with the fi...",2009-08-13 19:30:50
11,43025,2154746,29741,2,2,113006.0,"Hello, this is Carol Schumacher, Vice Presiden...",2009-08-13 19:30:50
12,43026,2154750,29741,6,2,113008.0,"Thanks, Eduardo. I’ll cover our quarterly resu...",2009-08-13 19:30:50
13,73580,2957395,41754,2,2,113006.0,"Hi, this is Carol Schumacher, Vice President o...",2009-11-13 00:44:42


### Word2Vec

In [16]:
print(f"target's transcript in 2009: shape: {target_2009_df.shape}, length: {len(target_2009_df["transcriptid"].unique())}")
print(f"walmart's transcript in 2009: shape: {walmart_2009_df.shape}, length: {len(walmart_2009_df["transcriptid"].unique())}")
print(f"kohl's transcript in 2009: shape: {kohl_2009_df.shape}, length: {len(kohl_2009_df["transcriptid"].unique())}")

target's transcript in 2009: shape: (266, 8), length: 4
walmart's transcript in 2009: shape: (17, 8), length: 3
kohl's transcript in 2009: shape: (364, 8), length: 4


In [17]:
target_2009_df["transcriptcreationdateutc"].unique()

<DatetimeArray>
['2009-12-10 21:18:42', '2009-05-20 21:53:01', '2009-08-18 22:34:13',
 '2009-11-17 22:01:00']
Length: 4, dtype: datetime64[ns]

In [18]:
walmart_2009_df["transcriptcreationdateutc"].unique()

<DatetimeArray>
['2009-05-14 19:18:38', '2009-08-13 19:30:50', '2009-11-13 00:44:42']
Length: 3, dtype: datetime64[ns]

In [19]:
kohl_2009_df["transcriptcreationdateutc"].unique()

<DatetimeArray>
['2009-02-27 05:48:51', '2009-05-14 17:18:12', '2009-08-13 21:58:24',
 '2009-11-12 21:42:14']
Length: 4, dtype: datetime64[ns]

In [20]:
target_2009_chosen_df = target_2009_df[target_2009_df["transcriptcreationdateutc"] == "2009-11-17 22:01:00"]
target_2009_chosen_df

Unnamed: 0.1,Unnamed: 0,transcriptcomponentid,transcriptid,componentorder,transcriptcomponenttypeid,transcriptpersonid,componenttext,transcriptcreationdateutc
262,94248,2989850,42219,3,2,113426.0,"Thanks, Gregg. Our third quarter results demon...",2009-11-17 22:01:00
263,94249,2989854,42219,7,3,104597.0,First question is really on the discretionary ...,2009-11-17 22:01:00
264,94250,2989858,42219,11,4,113424.0,"As you know, we don't often like to talk about...",2009-11-17 22:01:00
265,94251,2989862,42219,15,3,101669.0,"Just, Doug, on the accelerated depreciation yo...",2009-11-17 22:01:00
266,94252,2989870,42219,23,4,113424.0,Most of that mix improvement was driven by a c...,2009-11-17 22:01:00
...,...,...,...,...,...,...,...,...
330,94316,2989901,42219,54,3,113227.0,"And my second question, Doug, can you talk a l...",2009-11-17 22:01:00
331,94317,2989905,42219,58,3,113227.0,"And my final question, Kathy, if you could jus...",2009-11-17 22:01:00
332,94318,2989909,42219,62,4,113424.0,2010 store opening program is essentially alre...,2009-11-17 22:01:00
333,94319,2989913,42219,66,3,3259.0,On the expectations for the SG&A mid-single-di...,2009-11-17 22:01:00


In [21]:
walmart_2009_chosen_df = walmart_2009_df[walmart_2009_df["transcriptcreationdateutc"] == "2009-11-13 00:44:42"]
walmart_2009_chosen_df

Unnamed: 0.1,Unnamed: 0,transcriptcomponentid,transcriptid,componentorder,transcriptcomponenttypeid,transcriptpersonid,componenttext,transcriptcreationdateutc
13,73580,2957395,41754,2,2,113006.0,"Hi, this is Carol Schumacher, Vice President o...",2009-11-13 00:44:42
14,73581,2957399,41754,6,2,113008.0,"Thanks, Eduardo. Our International segment ass...",2009-11-13 00:44:42
15,73582,2957396,41754,3,2,113010.0,"Thank you, Carol, and thank you, everyone, for...",2009-11-13 00:44:42
16,73583,2957400,41754,7,2,113005.0,"Thank you, Doug. We are very excited about the...",2009-11-13 00:44:42
17,73584,2957397,41754,4,2,113007.0,"Thanks, Mike. Diluted net income per share for...",2009-11-13 00:44:42
18,73585,2957401,41754,8,2,113011.0,"Thanks a lot, Brian. Three weeks ago at our me...",2009-11-13 00:44:42
19,73586,2957394,41754,1,1,1.0,Welcome to the Wal-Mart Earnings Call for the ...,2009-11-13 00:44:42
20,73587,2957398,41754,5,2,113009.0,"Thank you, Charles. Our Wal-Mart U.S. top line...",2009-11-13 00:44:42


In [22]:
kohl_2009_chosen_df = kohl_2009_df[kohl_2009_df["transcriptcreationdateutc"] == "2009-11-12 21:42:14"]
kohl_2009_chosen_df

Unnamed: 0.1,Unnamed: 0,transcriptcomponentid,transcriptid,componentorder,transcriptcomponenttypeid,transcriptpersonid,componenttext,transcriptcreationdateutc
435,72742,2955603,41727,44,3,101782.0,"A few questions. First, traffic gains during t...",2009-11-12 21:42:14
436,72743,2955607,41727,48,3,3259.0,"Couple questions. First, on the Women's busine...",2009-11-12 21:42:14
437,72744,2955611,41727,52,3,3259.0,How much of it -- I think you said flat in the...,2009-11-12 21:42:14
438,72745,2955615,41727,56,7,1.0,Your next response is from Adrianne Shapira wi...,2009-11-12 21:42:14
439,72746,2955619,41727,60,4,113022.0,"I would think about, very similarly, to last y...",2009-11-12 21:42:14
...,...,...,...,...,...,...,...,...
546,72853,2955666,41727,107,3,97568.0,October's business was probably limited somewh...,2009-11-12 21:42:14
547,72854,2955670,41727,111,3,97568.0,"I know you're always working on new stuff, do ...",2009-11-12 21:42:14
548,72855,2955674,41727,115,4,113023.0,"No, that's it. Thanks, everybody. Have a good ...",2009-11-12 21:42:14
549,72856,2955574,41727,15,3,2725.0,Is there any negative impact from the lack of ...,2009-11-12 21:42:14


In [23]:
print(f"chosen target's transcript in 2009: shape: {target_2009_chosen_df.shape}, length: {len(target_2009_chosen_df["transcriptid"].unique())}")
print(f"chosen walmart's transcript in 2009: shape: {walmart_2009_chosen_df.shape}, length: {len(walmart_2009_chosen_df["transcriptid"].unique())}")
print(f"chosen kohl's transcript in 2009: shape: {kohl_2009_chosen_df.shape}, length: {len(kohl_2009_chosen_df["transcriptid"].unique())}")

chosen target's transcript in 2009: shape: (73, 8), length: 1
chosen walmart's transcript in 2009: shape: (8, 8), length: 1
chosen kohl's transcript in 2009: shape: (116, 8), length: 1


#### 資料前處理

In [24]:
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertModel
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from sklearn.metrics.pairwise import cosine_similarity


In [25]:
# Load pre-trained DistilBERT model and tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = TFDistilBertModel.from_pretrained("distilbert-base-uncased")
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [26]:
def tokenize_text(text_data):
    sentences = [sent for text in text_data if pd.notna(text) for sent in sent_tokenize(text)]  # 過濾NaN並進行句子分割
    return sentences

def lemmatize_text(sentences):
    preprocessed_sentences = []
    
    for sentence in sentences:
        words = word_tokenize(sentence.lower())
        lemmatized_sentence = " ".join([lemmatizer.lemmatize(word) for word in words if word.isalnum() and word not in stop_words])
        preprocessed_sentences.append(lemmatized_sentence)
    
    return preprocessed_sentences

In [27]:
# target_2009_sentences = tokenize_text(target_2009_chosen_df['componenttext'].tolist())
# walmart_2009_sentences = tokenize_text(walmart_2009_chosen_df['componenttext'].tolist())
# kohl_2009_sentences = tokenize_text(kohl_2009_chosen_df['componenttext'].tolist())
target_2009_sentences = tokenize_text(target_2009_df["componenttext"].tolist())
walmart_2009_sentences = tokenize_text(walmart_2009_df["componenttext"].tolist())
kohl_2009_sentences = tokenize_text(kohl_2009_df["componenttext"].tolist())
print(f"tokenized length of target: {len(walmart_2009_sentences)}")
print(f"tokenized length of walmart: {len(target_2009_sentences)}")
print(f"tokenized length of kohl: {len(kohl_2009_sentences)}")

tokenized length of target: 1324
tokenized length of walmart: 1459
tokenized length of kohl: 1568


In [28]:
target_2009_sentences[:5]

['Your next question comes from Daniel Binder - Jefferies & Co.',
 'Again based on how precipitously our November and December sales dropped off from prior trends, we could not - we didn’t anticipate that kind of fall off.',
 'And so we had goods in the pipeline that were coming regardless of the sales environment.',
 'We’ve been able to reset and get our spring sales forecast in the higher risk, seasonal discretionary categories established at a pretty consistent trend level.',
 'So we feel pretty good that we’ve got the sales and the receipts matched up appropriately as we look forward into 2009.']

In [29]:
target_2009_preprocessed = lemmatize_text(target_2009_sentences)
walmart_2009_preprocessed = lemmatize_text(walmart_2009_sentences)
kohl_2009_preprocessed = lemmatize_text(kohl_2009_sentences)
print(f"lemmatized length of target: {len(walmart_2009_preprocessed)}")
print(f"lemmatized length of walmart: {len(target_2009_preprocessed)}")
print(f"lemmatized length of kohl: {len(kohl_2009_preprocessed)}")

lemmatized length of target: 1324
lemmatized length of walmart: 1459
lemmatized length of kohl: 1568


In [30]:
target_2009_preprocessed[:5]

['next question come daniel binder jefferies co',
 'based precipitously november december sale dropped prior trend could anticipate kind fall',
 'good pipeline coming regardless sale environment',
 'able reset get spring sale forecast higher risk seasonal discretionary category established pretty consistent trend level',
 'feel pretty good got sale receipt matched appropriately look forward 2009']

In [31]:
# 將所有預處理過的句子合併成一個列表
all_preprocessed_sentences = target_2009_preprocessed + walmart_2009_preprocessed + kohl_2009_preprocessed
len(all_preprocessed_sentences)

4351

In [32]:
def extract_keywords(sentence):
    """
    從句子中提取可能的關鍵詞。
    這裡我們使用詞性標註來識別名詞和形容詞作為潛在的關鍵詞。
    """
    words = nltk.word_tokenize(sentence.lower())
    pos_tags = nltk.pos_tag(words)
    keywords = [word for word, pos in pos_tags if pos.startswith("NN") or pos.startswith("JJ") or pos.startswith("VB") or pos.startswith("RB")]
    return [lemmatizer.lemmatize(word) for word in keywords if word not in stop_words]

all_keywords = [keyword for sentence in all_preprocessed_sentences for keyword in extract_keywords(sentence)]
unique_keywords = list(set(all_keywords))
print(len(unique_keywords))

4025


In [33]:
from tqdm import tqdm

# 將單詞轉換為 DistilBERT 向量的函數
def get_word_embedding(word, tokenizer, model):
    inputs = tokenizer(word, return_tensors="tf")
    outputs = model(**inputs)
    return outputs.last_hidden_state[0][0].numpy()

# 計算 unique_keywords 中每個詞的嵌入向量
unique_keyword_embeddings = {}
for word in tqdm(unique_keywords, desc="Calculating embeddings for unique keywords"):
    unique_keyword_embeddings[word] = get_word_embedding(word, tokenizer, model)

key_words = ["supplier", "risk", "inventory", "customer"]
# 計算 key_words 中每個詞與 unique_keywords 中所有詞的相似度，並找出前 20 個最相似的詞
top_20_similar_words = {}
for key_word in key_words:
    key_word_embedding = get_word_embedding(key_word, tokenizer, model)
    similarities = {}
    for unique_word, embedding in unique_keyword_embeddings.items():
        similarity = cosine_similarity([key_word_embedding], [embedding])[0][0]
        similarities[unique_word] = similarity
    
    # 找出相似度最高的前 20 個詞
    sorted_similar_words = sorted(similarities.items(), key=lambda x: x[1], reverse=True)
    top_20_similar_words[key_word] = [word for word, sim in sorted_similar_words[:20] if sim > 0.5]

# 輸出結果
for key_word, similar_words in top_20_similar_words.items():
    print(f"Top 20 similar words for '{key_word}': {similar_words}")

Calculating embeddings for unique keywords: 100%|██████████| 4025/4025 [02:23<00:00, 28.01it/s]


Top 20 similar words for 'supplier': ['supplier', 'retailer', 'vendor', 'customer', 'provider', 'supply', 'payroll', 'prescription', 'otc', 'merchandise', 'fixture', 'apparel', 'mvc', 'company', 'wholesale', 'incorporates', 'accessory', 'builder', 'upgrade', 'imported']
Top 20 similar words for 'risk': ['risk', 'uncertainty', 'exposure', 'safety', 'stress', 'affect', 'liability', 'potential', 'outcome', 'monitoring', 'behavior', 'situation', 'compliance', 'relevancy', 'protection', 'concern', 'affecting', 'likelihood', 'portfolio', 'hindrance']
Top 20 similar words for 'inventory': ['inventory', 'catalog', 'payroll', 'stock', 'merchandise', 'retailer', 'information', 'item', 'sale', 'register', 'rental', 'area', 'indication', 'collection', 'luggage', 'turnover', 'assortment', 'warehouse', 'adjustment', 'commodity']
Top 20 similar words for 'customer': ['customer', 'consumer', 'provider', 'delivery', 'market', 'vendor', 'client', 'destination', 'employee', 'rental', 'offer', 'connection

In [48]:
len(top_20_similar_words["supplier"])

20

In [35]:
def find_common_words(key_words_similar):
    """
    找出在多個關鍵詞之間的共同詞。
    """
    # 儲存所有的共同詞
    common_words = {}
    
    # 將所有關鍵詞的相似詞轉換為集合形式，方便後續查找
    keyword_sets = {key: set(words) for key, words in key_words_similar.items()}
    
    # 獲取所有關鍵詞對的組合
    all_keywords = list(key_words_similar.keys())
    
    # 使用集合交集計算共同詞，避免重複比較
    for i in range(len(all_keywords)):
        for j in range(i + 1, len(all_keywords)):
            word1 = all_keywords[i]
            word2 = all_keywords[j]
            
            # 找出兩個集合的交集
            common = keyword_sets[word1].intersection(keyword_sets[word2])
            
            if common:
                common_words[(word1, word2)] = list(common)
    
    return common_words

# 使用先前定義的 key_words_similar
common_words = find_common_words(top_20_similar_words)

# 輸出每對關鍵詞之間的共同詞
for pair, words in common_words.items():
    print(f"Common words between '{pair[0]}' and '{pair[1]}': {', '.join(words)}")

Common words between 'supplier' and 'inventory': payroll, merchandise, retailer
Common words between 'supplier' and 'customer': customer, supply, provider, wholesale, supplier, vendor
Common words between 'risk' and 'customer': compliance
Common words between 'inventory' and 'customer': rental


In [36]:
from collections import defaultdict

def find_words_in_multiple_keywords(key_words_similar):
    """
    找出出現在多個 key_words 相似詞列表中的字，並記錄它們出現在哪些 key_words 中。
    """
    word_to_keywords = defaultdict(set)
    
    # 計算每個字在不同 key_words 相似詞列表中出現的次數及其所在的 key_words
    for key_word, similar_words in key_words_similar.items():
        unique_words = set(similar_words)  # 將每個列表轉換為集合，確保只計數一次
        for word in unique_words:
            word_to_keywords[word].add(key_word)
    
    # 過濾掉只出現在一個 key_word 的詞
    filtered_words = {word: keywords for word, keywords in word_to_keywords.items() if len(keywords) > 1}
    
    # 按照出現的 key_words 數量排序，出現多的排在前面
    sorted_words = sorted(filtered_words.items(), key=lambda x: len(x[1]), reverse=True)
    
    # 格式化輸出結果
    result = []
    for word, keywords in sorted_words:
        keywords_list = ", ".join(sorted(keywords))
        result.append(f"出現在 {keywords_list} 的詞有 {word}")
    
    return result

# 使用先前定義的 key_words_similar
common_words_info = find_words_in_multiple_keywords(top_20_similar_words)

# 輸出結果
for info in common_words_info:
    print(info)


出現在 customer, supplier 的詞有 customer
出現在 customer, supplier 的詞有 supply
出現在 inventory, supplier 的詞有 merchandise
出現在 customer, supplier 的詞有 wholesale
出現在 customer, supplier 的詞有 supplier
出現在 inventory, supplier 的詞有 payroll
出現在 inventory, supplier 的詞有 retailer
出現在 customer, supplier 的詞有 vendor
出現在 customer, supplier 的詞有 provider
出現在 customer, risk 的詞有 compliance
出現在 customer, inventory 的詞有 rental


In [37]:
from collections import Counter


def preprocess_sentence(sentence):
    tokens = nltk.word_tokenize(sentence.lower())
    return [word for word in tokens if word.isalnum() and word not in stop_words]

def count_key_words(sentence, seed_words):
    words = set(preprocess_sentence(sentence))
    return sum(1 for word in seed_words if word.lower() in words)

def get_top_sentences_with_seed_words(preprocessed_sentences, original_sentences, key_words_dict, top_n=5):
    results = {}
    for key, words in key_words_dict.items():
        seed_words = set(word.lower() for word in words)
        sentence_scores = [(original, preprocessed, count_key_words(preprocessed, seed_words)) 
                           for original, preprocessed in zip(original_sentences, preprocessed_sentences)]
        results[key] = sorted(sentence_scores, key=lambda x: x[2], reverse=True)[:top_n]
    return results

# def create_bag_of_words(preprocessed_sentences, original_sentences, key_words_dict):
#     bow_results = {}
#     for key, words in key_words_dict.items():
#         seed_words = set(word.lower() for word in words)
#         relevant_sentences = [(original, preprocessed) for original, preprocessed 
#                               in zip(original_sentences, preprocessed_sentences) 
#                               if count_key_words(preprocessed, seed_words) > 0]
#         all_words = [word for _, preprocessed in relevant_sentences for word in preprocess_sentence(preprocessed)]
#         bow_results[key] = Counter(all_words)
#     return bow_results


In [38]:
# 合併原始句子
all_original_sentences = target_2009_sentences + walmart_2009_sentences + kohl_2009_sentences

# 確保預處理後的句子和原始句子的數量相同
assert len(all_preprocessed_sentences) == len(all_original_sentences), "預處理後的句子數量與原始句子數量不符"

# 獲取每個關鍵字群的top句子
# top_sentences = get_top_sentences_with_seed_words(walmart_2009_preprocessed, walmart_2009_sentences, top_20_similar_words)

walmart_2009_chosen_sentences = tokenize_text(walmart_2009_chosen_df['componenttext'].tolist())
walmart_2009_chosen_preprocessed = lemmatize_text(walmart_2009_chosen_sentences)
top_sentences = get_top_sentences_with_seed_words(walmart_2009_chosen_preprocessed, walmart_2009_chosen_sentences, top_20_similar_words)

for key, sentences in top_sentences.items():
    print(f"\nTop 5 sentences for '{key}' keyword group:")
    for i, (original, preprocessed, count) in enumerate(sentences, 1):
        print(f"{i}. Score {count}: {original}")


Top 5 sentences for 'supplier' keyword group:
1. Score 2: Sales of food and consumables continued to lead growth, partially offset by softer sales in apparel and general merchandise items.The third quarter gross margin rate was flat to last year, despite a shift towards food and consumables.
2. Score 2: The shift to EDLP drove an increase in food sales of 3.6%, sales of general merchandise were relatively flat and apparel was down because of the overall economic slowdown.
3. Score 2: Wal-Mart's higher returns over the last decade are unique among many retailers and many other companies.
4. Score 2: Few companies, retailers or otherwise, have the momentum or opportunity that Wal-Mart has around the world.
5. Score 2: We are becoming a better company, making a difference in the community and in the lives of our customers and associates.

Top 5 sentences for 'risk' keyword group:
1. Score 3: These statements discuss, among other things, the anticipated comparable store sales for our Wal-

In [50]:
def find_top_words_in_sentences(preprocessed_sentences, key_words_dict, top_n=20):
    # 計算所有前處理句子中所有詞的出現次數
    all_words = [word for sentence in preprocessed_sentences for word in preprocess_sentence(sentence)]
    word_counter = Counter(all_words)
    
    # 收集所有關鍵字群中的種子詞
    all_seed_words = set(word.lower() for words in key_words_dict.values() for word in words)
    
    # 找出出現在種子詞字典內且在句子中出現頻率最高的前 top_n 個詞
    filtered_words = [(word, count) for word, count in word_counter.items() if word in all_seed_words]
    top_words = sorted(filtered_words, key=lambda x: x[1], reverse=True)[:top_n]
    
    return top_words

# 使用先前定義的 preprocessed_sentences 和 key_words_dict
top_words_in_sentences = find_top_words_in_sentences(walmart_2009_preprocessed, top_20_similar_words, top_n=10)

# 輸出結果
for key_word in key_words:
    print(f"\nTop 10 words that appear in sentences and are in '{key_word}' keyword group:")
    for word, count in top_words_in_sentences:
        if word in top_20_similar_words[key_word]:
            print(f"{word}: {count}")


Top 10 words that appear in sentences and are in 'supplier' keyword group:
customer: 155
company: 45
merchandise: 24

Top 10 words that appear in sentences and are in 'risk' keyword group:

Top 10 words that appear in sentences and are in 'inventory' keyword group:
sale: 329
inventory: 74
merchandise: 24
item: 19
area: 18

Top 10 words that appear in sentences and are in 'customer' keyword group:
customer: 155
comp: 80
market: 72
consumer: 26


In [40]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [41]:
model = SentenceTransformer("all-MiniLM-L6-v2")



In [42]:
def get_top_sentences(sentences, selected_sentence, n=5, similarity_threshold=0.99):
    # 對所有句子進行 embedding
    sentence_embeddings = model.encode(sentences)
    
    # 對 key_word_sentence 進行 embedding
    selected_sentence_embedding = model.encode(selected_sentence)
    
    # 計算相似度
    similarities = cosine_similarity([selected_sentence_embedding], sentence_embeddings)[0]

    # 過濾掉相似度為 1 的句子（避免完全相同的句子）
    filtered_indices = [i for i, sim in enumerate(similarities) if sim < similarity_threshold and sentences[i] != selected_sentence]

    # 根據過濾後的索引重新選取句子和相似度
    filtered_similarities = similarities[filtered_indices]
    filtered_sentences = [sentences[i] for i in filtered_indices]

    # 找到相似度最高的前 n 個句子
    top_n_indices = filtered_similarities.argsort()[-n:][::-1]
    top_n_sentences = [filtered_sentences[i] for i in top_n_indices]
    top_n_similarities = filtered_similarities[top_n_indices]
    
    # 返回排序後的 (句子, 相似度) 列表
    return sorted(list(zip(top_n_sentences, top_n_similarities)), key=lambda x: x[1], reverse=True)

In [43]:
top_sentences

{'supplier': [('Sales of food and consumables continued to lead growth, partially offset by softer sales in apparel and general merchandise items.The third quarter gross margin rate was flat to last year, despite a shift towards food and consumables.',
   'sale food consumables continued lead growth partially offset softer sale apparel general merchandise third quarter gross margin rate flat last year despite shift towards food consumables',
   2),
  ('The shift to EDLP drove an increase in food sales of 3.6%, sales of general merchandise were relatively flat and apparel was down because of the overall economic slowdown.',
   'shift edlp drove increase food sale sale general merchandise relatively flat apparel overall economic slowdown',
   2),
  ("Wal-Mart's higher returns over the last decade are unique among many retailers and many other companies.",
   'higher return last decade unique among many retailer many company',
   2),
  ('Few companies, retailers or otherwise, have the mom

In [44]:
def get_original_sentences(preprocessed_sentences, original_sentences, top_preprocessed):
    """
    根據預處理後的句子和相似度，找到對應的原始句子及其相似度。
    """
    # 創建一個字典來映射預處理後的句子到原始句子
    preprocessed_to_original = {prep: orig for prep, orig in zip(preprocessed_sentences, original_sentences)}
    
    # 直接使用 top_preprocessed 的順序來獲取原始句子
    return [(preprocessed_to_original[prep], sim) for prep, sim in top_preprocessed]

In [45]:
top_sentences["risk"][1][0] in walmart_2009_chosen_sentences

True

In [46]:
top_sentences["supplier"][1][0] in walmart_2009_chosen_sentences

True

In [47]:
for direction, sentences in top_sentences.items():
    print(f"\nTop sentences for direction '{direction}':")
    print(sentences[1][0])
    top_preprocessed = get_top_sentences(walmart_2009_sentences, sentences[1][0])

    # top_original_with_similarity = get_original_sentences(
    #     walmart_2009_preprocessed,
    #     walmart_2009_sentences,
    #     top_preprocessed
    # )
    
    for idx, (sentence, similarity) in enumerate(top_preprocessed, 1):
        print(f"Top {idx} sentence (Score {similarity:.4f}): {sentence}")


Top sentences for direction 'supplier':
The shift to EDLP drove an increase in food sales of 3.6%, sales of general merchandise were relatively flat and apparel was down because of the overall economic slowdown.
Top 1 sentence (Score 0.8053): Food sales were up, driven by the expansion of EDLP items and more effective customer communication of our lower prices.
Top 2 sentence (Score 0.7250): The shift in Every Day Low Price drove an increase in food sales of 2.1% and an increase in consumables sales of 6.7%.
Top 3 sentence (Score 0.6948): The general economic slowdown again affected spending on higher ticket general merchandise and apparel sales in the second quarter.Gross margin declined as a result of the continued rollout of EDLP.
Top 4 sentence (Score 0.6405): Sales of food and consumables continued to lead growth, partially offset by softer sales in apparel and general merchandise items.The third quarter gross margin rate was flat to last year, despite a shift towards food and co

- 找出還原的句子位置
- 找出各個種子字分別的 key words，要知道相對的排序，順序很重要
- 要找出不同構面的高分句子，了解句子來自哪個構面
- 實驗方向： 單一公司多個季度 vs. 同一產業同一季度多個公司

In [53]:
combined_keywords = []
for key in top_20_similar_words:
    combined_keywords.extend(top_20_similar_words[key])

combined_keywords = set(combined_keywords)
# 輸出結果
print(len(combined_keywords), combined_keywords)

69 {'risk', 'stock', 'delivery', 'customer', 'supply', 'builder', 'uncertainty', 'sale', 'merchandise', 'stress', 'compliance', 'consumer', 'indication', 'adjustment', 'client', 'safety', 'commodity', 'turnover', 'potential', 'configuration', 'wholesale', 'item', 'rental', 'employee', 'market', 'supplier', 'liability', 'payroll', 'imported', 'protection', 'collection', 'retailer', 'offer', 'area', 'company', 'apparel', 'vendor', 'mvc', 'portfolio', 'fixture', 'purpose', 'provider', 'likelihood', 'luggage', 'connection', 'warehouse', 'pricing', 'accessory', 'destination', 'situation', 'upgrade', 'relevancy', 'incorporates', 'affecting', 'hindrance', 'concern', 'catalog', 'comp', 'outcome', 'exposure', 'affect', 'inventory', 'otc', 'prescription', 'register', 'assortment', 'information', 'behavior', 'monitoring'}


In [58]:
def get_top_sentences_with_seed_words(preprocessed_sentences, original_sentences, key_words_set, top_n=5):
    # 初始化一個列表來儲存每個句子中的關鍵詞計數
    sentence_scores = []

    for original, preprocessed in zip(original_sentences, preprocessed_sentences):
        # 計算該句子中出現的關鍵詞數量
        count, found_words = count_key_words(preprocessed, key_words_set)
        # 保存原始句子，前處理句子，關鍵詞出現次數
        sentence_scores.append((original, preprocessed, count, found_words))

    # 按關鍵詞數量對句子進行排序，取前 top_n 個
    top_sentences = sorted(sentence_scores, key=lambda x: x[2], reverse=True)[:top_n]
    
    return top_sentences

def count_key_words(sentence, key_words_set):
    """
    計算句子中關鍵詞的出現次數，並返回找到的關鍵詞列表。
    """
    words = sentence.split()  # 將句子拆解為單詞列表
    found_words = [word for word in words if word in key_words_set]  # 找到句子中的關鍵詞
    return len(found_words), found_words  # 返回關鍵詞數量和找到的關鍵詞

for sentence in top_sentences:
    print(f"Score {sentence[2]}: {sentence[0]}\nFound Words: {sentence[3]}\n")

Score 20: These statements discuss, among other things, the anticipated comparable store sales for our Wal-Mart U.S. segment and for our Sam's Club segment for the current 13-week period; our anticipated diluted earnings per share from continuing operations for the current fiscal quarter and fiscal year 2010 as a whole; that such guidance is based on the assumption that currency exchange rates will remain constant at current levels; our anticipated tax rate for fiscal year 2010 and the factors that will affect that tax rate; the amount of the company's expected capital expenditures for fiscal year 2010; Wal-Mart improving its SG&A leverage and SG&A growing less than sales; customers finding value at Wal-Mart U.S., Sam's Club and the other Wal-Mart stores around the world; Wal-Mart's management not anticipating headwinds from currency exchange rates in the fourth quarter of fiscal year 2010 if all currencies remain constant against the U.S. dollar; management's expectations that Wal-Mar

In [61]:
def get_top_sentences_with_seed_words(preprocessed_sentences, original_sentences, key_word_dict, top_n=5):
    # 初始化一個列表來儲存每個句子中的關鍵詞計數
    sentence_scores = []

    # 將 key_word_dict 中的詞轉換成 set
    key_words_set = set(word for words in key_word_dict.values() for word in words)

    for original, preprocessed in zip(original_sentences, preprocessed_sentences):
        # 計算該句子中出現的關鍵詞數量
        count, found_words = count_key_words(preprocessed, key_words_set)
        # 追蹤這些找到的關鍵詞來自哪個類別
        category_counts = count_word_categories(found_words, key_word_dict)
        # 保存原始句子，前處理句子，關鍵詞出現次數，以及每個類別的關鍵詞次數
        sentence_scores.append((original, preprocessed, count, found_words, category_counts))

    # 按關鍵詞數量對句子進行排序，取前 top_n 個
    top_sentences = sorted(sentence_scores, key=lambda x: x[2], reverse=True)[:top_n]
    
    return top_sentences

def count_key_words(sentence, key_words_set):
    """
    計算句子中關鍵詞的出現次數，並返回找到的關鍵詞列表。
    """
    words = sentence.split()  # 將句子拆解為單詞列表
    found_words = [word for word in words if word in key_words_set]  # 找到句子中的關鍵詞
    return len(found_words), found_words  # 返回關鍵詞數量和找到的關鍵詞

def count_word_categories(found_words, key_word_dict):
    """
    計算找到的關鍵詞分別來自於哪個類別，返回每個類別的出現次數。
    """
    category_counts = {category: 0 for category in key_word_dict.keys()}
    
    for word in found_words:
        for category, words in key_word_dict.items():
            if word in words:
                category_counts[category] += 1
    
    return category_counts

top_sentences = get_top_sentences_with_seed_words(walmart_2009_chosen_preprocessed, walmart_2009_chosen_sentences, top_20_similar_words)

for sentence in top_sentences:
    print(f"""
        Score {sentence[2]}: {sentence[0]}
        Found Words: {sentence[3]}
        Category Counts: {sentence[4]}
    """)



        Score 20: These statements discuss, among other things, the anticipated comparable store sales for our Wal-Mart U.S. segment and for our Sam's Club segment for the current 13-week period; our anticipated diluted earnings per share from continuing operations for the current fiscal quarter and fiscal year 2010 as a whole; that such guidance is based on the assumption that currency exchange rates will remain constant at current levels; our anticipated tax rate for fiscal year 2010 and the factors that will affect that tax rate; the amount of the company's expected capital expenditures for fiscal year 2010; Wal-Mart improving its SG&A leverage and SG&A growing less than sales; customers finding value at Wal-Mart U.S., Sam's Club and the other Wal-Mart stores around the world; Wal-Mart's management not anticipating headwinds from currency exchange rates in the fourth quarter of fiscal year 2010 if all currencies remain constant against the U.S. dollar; management's expectations tha