In [1]:
import pandas as pd

# Locations of the raw triples file and of the cleaned output.
# The shared data root is spelled out once so that relocating the project
# means editing a single line instead of two long literals.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs unchanged on a machine where this directory exists.
DATA_DIR = '/Users/thebekhruz/Desktop/100Days-Of-Code/100-Days-of-NLP-Odyssey/data'
input_file_path = f'{DATA_DIR}/raw/swop_triples.csv'
output_file_path = f'{DATA_DIR}/intermediate/swop_triples_cleaned.csv'


# Initial Data Exploration🔍:
**Explore dataset shape, information, and NaN values.**


In [2]:
# Load the raw triples: tab-separated text without a header row, so the three
# column names are supplied explicitly.
df = pd.read_csv(
    input_file_path,
    sep='\t',
    header=None,
    names=['doc_id', 'type', 'value'],
)



In [3]:
# Row and column counts of the freshly loaded frame, printed as a tuple.
print((len(df.index), len(df.columns)))

(169958, 3)


In [4]:
# Count how many rows share each per-column pattern of missing values.
missing_mask = df.isna()
missing_mask.value_counts()

doc_id  type   value
False   False  False    168896
               True        574
        True   True        488
Name: count, dtype: int64

In [5]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() only appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169958 entries, 0 to 169957
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   doc_id  169958 non-null  object
 1   type    169470 non-null  object
 2   value   168896 non-null  object
dtypes: object(3)
memory usage: 3.9+ MB
None


In [6]:
# Keep every row that has at least one missing field, then preview the first ten.
has_missing = df.isna().any(axis='columns')
rows_with_nan = df.loc[has_missing]
rows_with_nan.head(10)

Unnamed: 0,doc_id,type,value
1187,This view shows the third and final phase of t...,,
1726,The foundation stone was laid by the Chairman ...,,
2446,The same event is shown in BFP01772,,
3079,This is likely to be The Crown at Wheeler End ...,,
3282,The group was started by the Holmer Green Wome...,,
3646,59f99c46-3663-3740-ac46-2e9479a773ab,description,
3683,The man and woman on the outside are Donald an...,,
3684,Accompanying them are Gwen and Sidney Hall. He...,,
3685,They were all at a party given by Sir Francis ...,,
3970,"Although the event took place in West Wycombe,...",,


In [7]:
# For each column in turn, show the rows whose entry in that column is missing.
for column, series in df.items():
    print(f"Rows with NaN in {column}:")
    print(df.loc[series.isna()])


Rows with NaN in doc_id:
Empty DataFrame
Columns: [doc_id, type, value]
Index: []
Rows with NaN in type:
                                                   doc_id type value
1187    This view shows the third and final phase of t...  NaN   NaN
1726    The foundation stone was laid by the Chairman ...  NaN   NaN
2446                  The same event is shown in BFP01772  NaN   NaN
3079    This is likely to be The Crown at Wheeler End ...  NaN   NaN
3282    The group was started by the Holmer Green Wome...  NaN   NaN
...                                                   ...  ...   ...
152444  The four children in the bottom right of the i...  NaN   NaN
157305  This is one of a series of photos (BFP78468 to...  NaN   NaN
159312                                  See also RHW31136  NaN   NaN
159542                                 See also RHW31138.  NaN   NaN
159586                                 See also RHW31138.  NaN   NaN

[488 rows x 3 columns]
Rows with NaN in value:
                   

# Identify and Handle Missing Values🚫

In [8]:
# First, remove rows where 'type' is NaN.
# In the exploration output these rows hold free text in 'doc_id' while both
# 'type' and 'value' are missing — presumably fragments of descriptions that
# were split across lines in the raw file (TODO confirm against the raw data).
# inplace=True mutates `df` itself, so every later cell sees the reduced frame.
df.dropna(subset=['type'], inplace=True)


In [9]:
# Fill the remaining gaps: a missing 'value' becomes the empty string.
df['value'] = df['value'].fillna('')



In [10]:
# Confirm the clean-up: per-column tally of values that are still missing.
remaining_missing = df.isna().sum()
print(remaining_missing)


doc_id    0
type      0
value     0
dtype: int64


# Validate 'doc_id' Format ✔️

In [11]:
# Pattern for the 8-4-4-4-12 hexadecimal UUID layout, anchored at both ends.
uuid_regex = r'^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$'

# Boolean mask with one entry per row: True where doc_id follows the UUID layout.
matches_uuid_format = df['doc_id'].str.match(uuid_regex)

# Rows whose doc_id is anything else (the complement of the mask), and their count.
not_matching = df.loc[~matches_uuid_format]
not_matching_count = len(not_matching)

# Show the offending rows first, then how many of them there are.
print(not_matching)
print(f"Count of doc_id values not matching UUID format: {not_matching_count}")

                                         doc_id              type  \
29         https://www.wikidata.org/wiki/Q64116       tanc:became   
244        https://www.wikidata.org/wiki/Q19695      tanc:showing   
357     https://www.wikidata.org/wiki/Q12823105      tanc:gardens   
382      https://www.wikidata.org/wiki/Q8039639      tanc:showing   
561        https://www.wikidata.org/wiki/Q64116   tanc:surrounded   
...                                         ...               ...   
169200   https://www.wikidata.org/wiki/Q1541030         tanc:gain   
169421       https://www.wikidata.org/wiki/Q222  tanc:transferred   
169662   https://www.wikidata.org/wiki/Q5378261      tanc:dancing   
169776   https://www.wikidata.org/wiki/Q8039639      tanc:singing   
169954   https://www.wikidata.org/wiki/Q5378261      tanc:dancing   

                                         value  
29      https://www.wikidata.org/wiki/Q8034980  
244       https://www.wikidata.org/wiki/Q64116  
357     https://www.wiki

In [12]:
# Keep only the rows whose doc_id passed the UUID check; all other rows are dropped.
df = df.loc[matches_uuid_format]

# Report the frame's dimensions after filtering.
print(df.shape)


(168477, 3)


In [13]:
# List the distinct labels left in the 'type' column after cleaning.
type_labels = df['type'].unique()
print(type_labels)


['title' 'description' 'mentions']


### Export Cleaned Data

In [14]:
# Write the cleaned triples as tab-separated text with no index column and no
# header row.
# header=False is the documented way to suppress the header line; header=None
# appears to work only because it is falsy.
# NOTE: empty strings in 'value' are written as empty fields, which
# pandas.read_csv parses back as NaN by default when this file is reloaded.
df.to_csv(output_file_path, sep='\t', index=False, header=False)
