In [2]:
# display the snomed file
import pandas as pd

df = pd.read_csv('../data/SNOMED_5000.csv',
                low_memory=False
                )

# 显示数据的基本信息
print("数据形状:", df.shape)
print("列名:", df.columns.tolist())

# 随机展示5行数据的完整内容
print("\n随机5行数据:")
pd.set_option('display.max_columns', None)  # 显示所有列
pd.set_option('display.width', None)  # 显示所有内容不截断
pd.set_option('display.max_colwidth', None)  # 显示每列的完整内容
print(df.sample(5))  # 使用sample方法随机抽取5行

数据形状: (5002, 11)
列名: ['concept_id', 'concept_name', 'domain_id', 'vocabulary_id', 'concept_class_id', 'standard_concept', 'concept_code', 'valid_start_date', 'valid_end_date', 'invalid_reason', 'FSN']

随机5行数据:
      concept_id                                              concept_name  \
1546    45763217                   Gastroduodenal motility analysis system   
1683    37019121       Entire muscle acting on lumbar intervertebral joint   
4645    46270562  History of small vessel disease due to diabetes mellitus   
11       4293737      Surgical margin involved by ductal carcinoma in situ   
3733    37303904                        Structure of retropharyngeal space   

               domain_id vocabulary_id   concept_class_id standard_concept  \
1546              Device        SNOMED    Physical Object                S   
1683  Spec Anatomic Site        SNOMED     Body Structure                S   
4645         Observation        SNOMED  Context-dependent              NaN   
11       

In [3]:
# 显示每列的数据类型和非空值数量
print("\n数据类型和非空值统计:")
print(df.info())

# 显示数值列的统计摘要
# print("\n数值列统计摘要:")
# print(df.describe())


数据类型和非空值统计:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5002 entries, 0 to 5001
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   concept_id        5002 non-null   int64  
 1   concept_name      5002 non-null   object 
 2   domain_id         5002 non-null   object 
 3   vocabulary_id     5002 non-null   object 
 4   concept_class_id  5002 non-null   object 
 5   standard_concept  4191 non-null   object 
 6   concept_code      5002 non-null   int64  
 7   valid_start_date  5002 non-null   int64  
 8   valid_end_date    5002 non-null   int64  
 9   invalid_reason    0 non-null      float64
 10  FSN               4499 non-null   object 
dtypes: float64(1), int64(4), object(6)
memory usage: 430.0+ KB
None


In [4]:
# 检索包含"Headace"的概念名称
print("\n包含'Chronic back pain的概念:")
backpain_concepts = df[df['concept_name'].str.contains('Chronic back pain', case=False, na=False)]
print(f"找到 {len(backpain_concepts)} 条包含'backpain'的记录")
print(backpain_concepts[['concept_code', 'concept_name', 'domain_id', 'concept_class_id']])


包含'Chronic back pain的概念:
找到 0 条包含'backpain'的记录
Empty DataFrame
Columns: [concept_code, concept_name, domain_id, concept_class_id]
Index: []


In [5]:
# domain_id 和 concept_class_id - 展示一下列表
print("\ndomain_id 和 concept_class_id 的列表:")
print(df['domain_id'].unique())
print(df['concept_class_id'].unique())


domain_id 和 concept_class_id 的列表:
['Condition' 'Procedure' 'Observation' 'Measurement' 'Drug' 'Device'
 'Spec Anatomic Site' 'Specimen' 'Unit' 'Metadata' 'Race' 'Meas Value'
 'Type Concept' 'Provider' 'Relationship' 'Geography' 'Language' 'Route']
['Disorder' 'Procedure' 'Observable Entity' 'Clinical Drug Form'
 'Physical Object' 'Event' 'Organism' 'Clinical Finding' 'Body Structure'
 'Pharma/Biol Product' 'Morph Abnormality' 'Specimen' 'Substance'
 'Staging / Scales' 'Dose Form' 'Social Context' 'Qualifier Value'
 'Context-dependent' 'Clinical Drug' 'Model Comp' 'Location'
 'Record Artifact' 'Attribute' 'Navi Concept' 'Linkage Assertion'
 'Disposition' 'Physical Force' 'Namespace Concept']


In [8]:
# 检索包含"Dyspnea"的概念名称
print("\n包含'Dyspnea'的概念:")
dyspnea_concepts = df[df['concept_name'].str.contains('Dyspnea', case=False, na=False)]
print(f"找到 {len(dyspnea_concepts)} 条包含'Dyspnea'的记录")
print(dyspnea_concepts)
print(dyspnea_concepts[['concept_code', 'concept_name', 'FSN']])


包含'Dyspnea'的概念:
找到 1 条包含'Dyspnea'的记录
      concept_id concept_name  domain_id vocabulary_id  concept_class_id  \
5000      312437      Dyspnea  Condition        SNOMED  Clinical Finding   

     standard_concept  concept_code  valid_start_date  valid_end_date  \
5000                S     267036007          20020131        20991231   

      invalid_reason                FSN  
5000             NaN  Dyspnea (finding)  
      concept_code concept_name                FSN
5000     267036007      Dyspnea  Dyspnea (finding)


In [11]:
# 321341 - 拿这个索引号， 第321341行
print("\n检索第3212行的数据:")
print(df.iloc[3212])


检索第3212行的数据:
concept_id                                                                                                                                               36683710
concept_name                                                                                           Bupivacaine hydrochloride 2.5 mg/mL solution for injection
domain_id                                                                                                                                                    Drug
vocabulary_id                                                                                                                                              SNOMED
concept_class_id                                                                                                                                    Clinical Drug
standard_concept                                                                                                                                              NaN
concept_code  

In [14]:
# 尝试使用模糊匹配
print("\n尝试模糊匹配:")
fuzzy_matches = df[df['concept_code'].str.contains('7036', na=False)]
if len(fuzzy_matches) > 0:
    print("找到的模糊匹配:")
    print(fuzzy_matches[['concept_code', 'concept_name']])

# 检查Dyspnea相关的行
print("\n检查Dyspnea相关的行:")
dyspnea_rows = df[df['concept_name'].str.contains('Dyspnea', case=False, na=False)]
print(dyspnea_rows[['concept_code', 'concept_name']])


尝试模糊匹配:


AttributeError: Can only use .str accessor with string values!