In [1]:
import pandas as pd

In [2]:
df = pd.read_parquet('subset_10_1.parquet')

In [3]:
df

Unnamed: 0,identifier,collection,license,text
0,http://www.abs.gov.au/AUSSTATS/abs@.nsf/allpri...,Creative Commons Common Crawl,Various open licenses,Australian Bureau of Statistics\nCelebrating t...
1,github_open_source_10_1_0,Github OpenSource,Various open source,"/*\n * Copyright 2014 Red Hat, Inc.\n *\n * Li..."
2,github_open_source_10_1_1,Github OpenSource,Various open source,<?php namespace Kernel;\n\n/**\n * Class Error...
3,github_open_source_10_1_2,Github OpenSource,Various open source,﻿using ESFA.DC.ILR.ValidationService.Data.Exte...
4,github_open_source_10_1_3,Github OpenSource,Various open source,mod action;\nmod app;\nmod common;\nmod dots;\...
...,...,...,...,...
51371,github_open_source_10_1_20428,Github OpenSource,Various open source,import time\nimport board\nimport busio\n\n#fo...
51372,github_open_source_10_1_20429,Github OpenSource,Various open source,# STOP AND CLEAN ALL DOCKER CONTAINERS\ndocker...
51373,https://www.omicsonline.org/singapore/rhabdomy...,Creative Commons Common Crawl,Various open licenses,Rhabdomyosarcoma | Singapore| PDF | PPT| Case ...
51374,github_open_source_10_1_20430,Github OpenSource,Various open source,<?php\n/*\n * MindTouch Deki - enterprise coll...


In [4]:
import re
from langdetect import detect  # for language detection

In [5]:
def clean_dataset(df):
    # Remove programming language entries (Github entries)
    df = df[df['collection'] != 'Github OpenSource'].copy()
    
    def clean_text(text):
        if pd.isna(text):
            return text
        # Remove special characters and extra whitespace
        text = re.sub(r'[\\/\*\n\r\t]+', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()
    
    # Apply text cleaning
    df['text'] = df['text'].apply(clean_text)
    
    def is_english(text):
        try:
            if pd.isna(text):
                return False
            return detect(text) == 'en'
        except:
            return False
    
    df['is_english'] = df['text'].apply(is_english)
    df_clean = df[df['is_english']].copy()
    
    df_clean = df_clean.reset_index(drop=True)
    
    return df_clean

In [6]:
df = pd.read_parquet('subset_10_1.parquet')
cleaned_df = clean_dataset(df)

In [7]:
cleaned_df

Unnamed: 0,identifier,collection,license,text,is_english
0,http://www.abs.gov.au/AUSSTATS/abs@.nsf/allpri...,Creative Commons Common Crawl,Various open licenses,Australian Bureau of Statistics Celebrating th...,True
1,69602184_1,Wikipedia,CC-By-SA,The 2022 Bendigo International was a professio...,True
2,http://www.go4expert.com/community/hello-every...,Creative Commons Common Crawl,Various open licenses,"""Hello Everyone"" Newbie Member Hello to everym...",True
3,9841122_1,courtlistener,Public Domain,NOTICE: All slip opinions and orders are subje...,True
4,497446_1,Caselaw_Access_Project,Public Domain,"OPINION AND ORDER LETTOW, Judge. This ease inv...",True
...,...,...,...,...,...
20924,US-61949275-A_1,USPTO,Public Domain,"Construction for uniting the brake case cover,...",True
20925,https://sunlightfoundation.com/2017/11/28/toda...,Creative Commons Common Crawl,Various open licenses,Today in OpenGov: Weaving a web of accountabil...,True
20926,6549cad947f17b93383bb3c3a62d1b34_2,French-Science-Pile,Various open science,Discussion and conclusion Civic stratification...,True
20927,5765768_1,courtlistener,Public Domain,"Present—Scudder, P.J., Hurlbutt, Fahey, Perado...",True


In [8]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\umair\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\umair\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\umair\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\umair\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\umair\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords , wordnet
from nltk.stem import WordNetLemmatizer

def further_clean_dataset(cleaned_df):
    df = cleaned_df.copy()
    
    min_chars = 100
    df = df[df['text'].str.len() > min_chars]
    
    def deep_clean_text(text):
        if pd.isna(text):
            return text
            
        # Remove URLs
        text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
        
        # Remove email addresses
        text = re.sub(r'[\w\.-]+@[\w\.-]+', '', text)
        
        # Remove special characters but keep periods and basic punctuation
        text = re.sub(r'[^\w\s.,!?-]', ' ', text)
        
        # Remove multiple spaces
        text = re.sub(r'\s+', ' ', text)
        
        # Remove multiple periods
        text = re.sub(r'\.{2,}', '.', text)

        # Remove stop words
        stop_words = set(stopwords.words('english'))
        words = word_tokenize(text)
        words = [word for word in words if word.lower() not in stop_words]
       
        # Lemmatization
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]
        
        return ' '.join(words).strip()
    
    # Apply further cleaning to the already cleaned text
    df['text'] = df['text'].apply(deep_clean_text)
    
    # 3. Quality filters
    def text_quality_check(text):
        if pd.isna(text):
            return False
            
        # Check number of sentences
        sentences = sent_tokenize(text)
        if len(sentences) < 2:  # Require at least 2 sentences
            return False
            
        # Check for reasonable sentence length
        avg_sent_len = sum(len(s.split()) for s in sentences) / len(sentences)
        if avg_sent_len < 5 or avg_sent_len > 100:  # Filter out very short/long sentences
            return False
            
        # Checking text-to-word ratio for detecting garbage text
        words = text.split()
        if len(words) == 0:
            return False
        char_to_word_ratio = len(text) / len(words)
        if char_to_word_ratio > 20:  # Likely garbage text
            return False
            
        return True
    
    df['is_quality'] = df['text'].apply(text_quality_check)
    
    quality_sources = ['Wikipedia', 'courtlistener', 'Caselaw_Access_Project']
    df['is_quality_source'] = df['collection'].isin(quality_sources)
    
    def get_content_type(text, source):
        if pd.isna(text):
            return 'unknown'
        
        if 'OPINION AND ORDER' in text or 'NOTICE:' in text:
            return 'legal'
        elif len(text.split('\n')) > 10:
            return 'article'
        elif len(text) < 500:
            return 'short_form'
        else:
            return 'general'
    
    df['content_type'] = df.apply(lambda x: get_content_type(x['text'], x['collection']), axis=1)
    
    # 6. Final filtering
    final_df = df[
        (df['is_quality']) & 
        (df['content_type'] != 'unknown')
    ].copy()
    
    # 7. Drop intermediate columns and reset index
    final_df = final_df.drop(['is_quality', 'is_quality_source'], axis=1)
    final_df = final_df.reset_index(drop=True)
    
    return final_df

further_cleaned_df = further_clean_dataset(cleaned_df)

# Get statistics
def get_dataset_stats(df):
    stats = {
        'total_rows': len(df),
        'avg_text_length': df['text'].str.len().mean(),
        'content_type_distribution': df['content_type'].value_counts(),
        'source_distribution': df['collection'].value_counts().head()
    }
    return stats

print("Dataset Statistics:")
print(get_dataset_stats(further_cleaned_df))

Dataset Statistics:
{'total_rows': 20407, 'avg_text_length': 11920.50149458519, 'content_type_distribution': content_type
general       18454
short_form     1953
Name: count, dtype: int64, 'source_distribution': collection
Creative Commons Common Crawl    9766
USPTO                            2845
US-PD-Newspapers                 2000
English-PD                       1658
courtlistener                    1515
Name: count, dtype: int64}


In [10]:
further_cleaned_df

Unnamed: 0,identifier,collection,license,text,is_english,content_type
0,http://www.abs.gov.au/AUSSTATS/abs@.nsf/allpri...,Creative Commons Common Crawl,Various open licenses,Australian Bureau Statistics Celebrating Inter...,True,general
1,69602184_1,Wikipedia,CC-By-SA,2022 Bendigo International professional tennis...,True,general
2,http://www.go4expert.com/community/hello-every...,Creative Commons Common Crawl,Various open licenses,Hello Everyone Newbie Member Hello everymember...,True,general
3,9841122_1,courtlistener,Public Domain,NOTICE slip opinion order subject formal revis...,True,general
4,497446_1,Caselaw_Access_Project,Public Domain,"OPINION ORDER LETTOW , Judge . ease involves c...",True,general
...,...,...,...,...,...,...
20402,Evaluation of the intergovernmental committees...,European Open Data,Various open data,EVALUATION INTERGOVERNMENTAL COMMITTEES Final ...,True,general
20403,US-61949275-A_1,USPTO,Public Domain,"Construction uniting brake case cover , rear w...",True,general
20404,https://sunlightfoundation.com/2017/11/28/toda...,Creative Commons Common Crawl,Various open licenses,Today OpenGov Weaving web accountability today...,True,general
20405,6549cad947f17b93383bb3c3a62d1b34_2,French-Science-Pile,Various open science,Discussion conclusion Civic stratification pos...,True,general


In [11]:
further_cleaned_df = further_cleaned_df.drop(columns=['license'])

In [12]:
further_cleaned_df

Unnamed: 0,identifier,collection,text,is_english,content_type
0,http://www.abs.gov.au/AUSSTATS/abs@.nsf/allpri...,Creative Commons Common Crawl,Australian Bureau Statistics Celebrating Inter...,True,general
1,69602184_1,Wikipedia,2022 Bendigo International professional tennis...,True,general
2,http://www.go4expert.com/community/hello-every...,Creative Commons Common Crawl,Hello Everyone Newbie Member Hello everymember...,True,general
3,9841122_1,courtlistener,NOTICE slip opinion order subject formal revis...,True,general
4,497446_1,Caselaw_Access_Project,"OPINION ORDER LETTOW , Judge . ease involves c...",True,general
...,...,...,...,...,...
20402,Evaluation of the intergovernmental committees...,European Open Data,EVALUATION INTERGOVERNMENTAL COMMITTEES Final ...,True,general
20403,US-61949275-A_1,USPTO,"Construction uniting brake case cover , rear w...",True,general
20404,https://sunlightfoundation.com/2017/11/28/toda...,Creative Commons Common Crawl,Today OpenGov Weaving web accountability today...,True,general
20405,6549cad947f17b93383bb3c3a62d1b34_2,French-Science-Pile,Discussion conclusion Civic stratification pos...,True,general


In [13]:
# Identify and remove duplicates based on the 'identifier' column
def remove_duplicates(df):
    # Check for duplicates based on the 'identifier' column
    duplicates = df[df.duplicated(subset='identifier', keep=False)]

    # Print the number of duplicates found
    print(f"Number of duplicate entries found: {len(duplicates)}")

    # Remove duplicates, keeping the first occurrence
    df_no_duplicates = df.drop_duplicates(subset='identifier', keep='first')

    return df_no_duplicates

# Remove duplicates from the further_cleaned_df DataFrame
further_cleaned_df = remove_duplicates(further_cleaned_df)

# Get statistics after removing duplicates
def get_dataset_stats(df):
    stats = {
        'total_rows': len(df),
        'avg_text_length': df['text'].str.len().mean(),
        'content_type_distribution': df['content_type'].value_counts(),
        'source_distribution': df['collection'].value_counts().head()
    }
    return stats

# Print statistics
print("Dataset Statistics after Removing Duplicates:")
print(get_dataset_stats(further_cleaned_df))

Number of duplicate entries found: 2023
Dataset Statistics after Removing Duplicates:
{'total_rows': 18987, 'avg_text_length': 12643.090377626797, 'content_type_distribution': content_type
general       17270
short_form     1717
Name: count, dtype: int64, 'source_distribution': collection
Creative Commons Common Crawl    8346
USPTO                            2845
US-PD-Newspapers                 2000
English-PD                       1658
courtlistener                    1515
Name: count, dtype: int64}


In [15]:
further_cleaned_df.head(10)

Unnamed: 0,identifier,collection,text,is_english,content_type
0,http://www.abs.gov.au/AUSSTATS/abs@.nsf/allpri...,Creative Commons Common Crawl,Australian Bureau Statistics Celebrating Inter...,True,general
1,69602184_1,Wikipedia,2022 Bendigo International professional tennis...,True,general
2,http://www.go4expert.com/community/hello-every...,Creative Commons Common Crawl,Hello Everyone Newbie Member Hello everymember...,True,general
3,9841122_1,courtlistener,NOTICE slip opinion order subject formal revis...,True,general
4,497446_1,Caselaw_Access_Project,"OPINION ORDER LETTOW , Judge . ease involves c...",True,general
5,https://globalvoices.org/-/world/east-asia/cam...,Creative Commons Common Crawl,"Close Support Global Voices stay independent ,...",True,general
6,sn85058397_1912-09-19_1_5_1,US-PD-Newspapers,"5 1 EVENING STANDARD , OGDEN , UTAH , THURSDAY...",True,general
7,1864957_1,courtlistener,"427 . 2d 1114 1983 STATE Florida , Appellant ,...",True,general
8,5975622_1,courtlistener,action judgment declaring respondent Governmen...,True,general
9,https://cloud.google.com/spanner/docs/instance...,Creative Commons Common Crawl,Instances page introduces concept Cloud Spanne...,True,general


In [16]:
def get_detailed_content_type(text, source):
    if pd.isna(text):
        return 'unknown', 'unknown'
    
    text_lower = text.lower()
    words = set(word_tokenize(text_lower))
    
    categories = {
        'legal': {
            'keywords': {'court', 'law', 'judge', 'legal', 'plaintiff', 'defendant', 'ruling', 'verdict', 'opinion', 'order', 'case'},
            'subcategories': {
                'court_decision': {'opinion', 'order', 'court', 'ruling', 'judgment'},
                'legal_document': {'brief', 'motion', 'petition', 'filing'},
                'legal_analysis': {'analysis', 'commentary', 'review'}
            }
        },
        'science': {
            'keywords': {'research', 'study', 'experiment', 'scientific', 'analysis', 'data', 'methodology', 'results'},
            'subcategories': {
                'medical': {'patient', 'treatment', 'clinical', 'medical', 'health', 'disease'},
                'technology': {'software', 'computer', 'algorithm', 'system', 'technology', 'digital'},
                'environmental': {'climate', 'environmental', 'ecology', 'sustainable', 'conservation'},
                'physics': {'physics', 'quantum', 'particle', 'energy', 'matter'},
                'biology': {'biology', 'cell', 'gene', 'protein', 'organism'}
            }
        },
        'history': {
            'keywords': {'history', 'historical', 'century', 'ancient', 'era', 'period', 'war', 'revolution'},
            'subcategories': {
                'political_history': {'government', 'politics', 'revolution', 'regime', 'democracy'},
                'social_history': {'society', 'cultural', 'social', 'community'},
                'military_history': {'war', 'battle', 'military', 'army', 'conflict'},
                'economic_history': {'economy', 'trade', 'market', 'economic', 'financial'}
            }
        },
        'academic': {
            'keywords': {'university', 'academic', 'research', 'study', 'theory', 'analysis'},
            'subcategories': {
                'research_paper': {'methodology', 'findings', 'conclusion', 'abstract'},
                'thesis': {'dissertation', 'thesis', 'research', 'study'},
                'review': {'literature', 'review', 'analysis', 'critique'}
            }
        },
        'news': {
            'keywords': {'news', 'report', 'journalist', 'press', 'media', 'announcement'},
            'subcategories': {
                'current_events': {'current', 'today', 'recent', 'latest'},
                'press_release': {'announces', 'released', 'statement', 'press'},
                'news_article': {'reported', 'according', 'sources'}
            }
        }
    }
    
    if source in ['courtlistener', 'Caselaw_Access_Project']:
        main_category = 'legal'
    elif source == 'Wikipedia':
        # For Wikipedia, we'll determine category based on content
        main_category = None
    else:
        main_category = None
    
    if not main_category:
        # Count keyword matches for each category
        category_scores = {}
        for category, data in categories.items():
            keyword_matches = len(words.intersection(data['keywords']))
            category_scores[category] = keyword_matches
        
        # Get category with highest score
        if any(category_scores.values()):
            main_category = max(category_scores.items(), key=lambda x: x[1])[0]
        else:
            main_category = 'general'
    
    # Determine subcategory
    if main_category in categories:
        subcategory_scores = {}
        for subcategory, subcategory_keywords in categories[main_category]['subcategories'].items():
            matches = len(words.intersection(subcategory_keywords))
            subcategory_scores[subcategory] = matches
        
        if any(subcategory_scores.values()):
            subcategory = max(subcategory_scores.items(), key=lambda x: x[1])[0]
        else:
            subcategory = f'{main_category}_general'
    else:
        subcategory = 'general'
    
    return main_category, subcategory

def enhance_content_classification(df):
    # Create new columns for main category and subcategory
    df[['main_category', 'subcategory']] = pd.DataFrame(
        df.apply(lambda x: get_detailed_content_type(x['text'], x['collection']), axis=1).tolist(),
        index=df.index
    )
    
    df['text_length'] = df['text'].str.len()
    df['sentence_count'] = df['text'].apply(lambda x: len(sent_tokenize(x)) if pd.notna(x) else 0)
    
    return df


enhanced_df = enhance_content_classification(further_cleaned_df)

# Get classification statistics
def get_classification_stats(df):
    stats = {
        'main_category_distribution': df['main_category'].value_counts(),
        'subcategory_distribution': df['subcategory'].value_counts(),
        'avg_text_length_by_category': df.groupby('main_category')['text_length'].mean(),
        'avg_sentences_by_category': df.groupby('main_category')['sentence_count'].mean(),
        'source_category_correlation': pd.crosstab(df['collection'], df['main_category'])
    }
    return stats

stats = get_classification_stats(enhanced_df)
print("Classification Statistics:")
for key, value in stats.items():
    print(f"\n{key}:")
    print(value)
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[['main_category', 'subcategory']] = pd.DataFrame(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[['main_category', 'subcategory']] = pd.DataFrame(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text_length'] = df['text'].str.len()


Classification Statistics:

main_category_distribution:
main_category
legal       9376
general     3034
science     2945
history     1936
news         940
academic     756
Name: count, dtype: int64

subcategory_distribution:
subcategory
court_decision       7896
general              3034
legal_general         920
technology            901
history_general       834
medical               788
science_general       660
military_history      477
legal_document        437
news_general          385
thesis                360
current_events        315
social_history        275
biology               275
political_history     237
research_paper        200
environmental         200
press_release         171
legal_analysis        123
physics               121
economic_history      113
academic_general       99
review                 97
news_article           69
Name: count, dtype: int64

avg_text_length_by_category:
main_category
academic     6743.111111
general      2531.368820
history      9881.0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentence_count'] = df['text'].apply(lambda x: len(sent_tokenize(x)) if pd.notna(x) else 0)


In [18]:
enhanced_df

Unnamed: 0,identifier,collection,text,is_english,content_type,main_category,subcategory,text_length,sentence_count
0,http://www.abs.gov.au/AUSSTATS/abs@.nsf/allpri...,Creative Commons Common Crawl,Australian Bureau Statistics Celebrating Inter...,True,general,general,general,764,5
1,69602184_1,Wikipedia,2022 Bendigo International professional tennis...,True,general,legal,court_decision,1600,12
2,http://www.go4expert.com/community/hello-every...,Creative Commons Common Crawl,Hello Everyone Newbie Member Hello everymember...,True,general,academic,academic_general,1023,15
3,9841122_1,courtlistener,NOTICE slip opinion order subject formal revis...,True,general,legal,court_decision,33425,358
4,497446_1,Caselaw_Access_Project,"OPINION ORDER LETTOW , Judge . ease involves c...",True,general,legal,court_decision,31051,495
...,...,...,...,...,...,...,...,...,...
20402,Evaluation of the intergovernmental committees...,European Open Data,EVALUATION INTERGOVERNMENTAL COMMITTEES Final ...,True,general,legal,court_decision,36005,243
20403,US-61949275-A_1,USPTO,"Construction uniting brake case cover , rear w...",True,general,legal,court_decision,12782,104
20404,https://sunlightfoundation.com/2017/11/28/toda...,Creative Commons Common Crawl,Today OpenGov Weaving web accountability today...,True,general,legal,court_decision,5724,51
20405,6549cad947f17b93383bb3c3a62d1b34_2,French-Science-Pile,Discussion conclusion Civic stratification pos...,True,general,science,medical,24557,359


In [19]:
enhanced_df = enhanced_df.drop(columns=['content_type'])

In [20]:
enhanced_df

Unnamed: 0,identifier,collection,text,is_english,main_category,subcategory,text_length,sentence_count
0,http://www.abs.gov.au/AUSSTATS/abs@.nsf/allpri...,Creative Commons Common Crawl,Australian Bureau Statistics Celebrating Inter...,True,general,general,764,5
1,69602184_1,Wikipedia,2022 Bendigo International professional tennis...,True,legal,court_decision,1600,12
2,http://www.go4expert.com/community/hello-every...,Creative Commons Common Crawl,Hello Everyone Newbie Member Hello everymember...,True,academic,academic_general,1023,15
3,9841122_1,courtlistener,NOTICE slip opinion order subject formal revis...,True,legal,court_decision,33425,358
4,497446_1,Caselaw_Access_Project,"OPINION ORDER LETTOW , Judge . ease involves c...",True,legal,court_decision,31051,495
...,...,...,...,...,...,...,...,...
20402,Evaluation of the intergovernmental committees...,European Open Data,EVALUATION INTERGOVERNMENTAL COMMITTEES Final ...,True,legal,court_decision,36005,243
20403,US-61949275-A_1,USPTO,"Construction uniting brake case cover , rear w...",True,legal,court_decision,12782,104
20404,https://sunlightfoundation.com/2017/11/28/toda...,Creative Commons Common Crawl,Today OpenGov Weaving web accountability today...,True,legal,court_decision,5724,51
20405,6549cad947f17b93383bb3c3a62d1b34_2,French-Science-Pile,Discussion conclusion Civic stratification pos...,True,science,medical,24557,359


In [22]:
def count_tokens(text):
    if pd.isna(text):
        return 0
    tokens = word_tokenize(text)
    return len(tokens)

total_tokens = df['text'].apply(count_tokens).sum()

print(f"Total number of tokens in the 'text' column: {total_tokens}")

Total number of tokens in the 'text' column: 122766648


In [23]:
def count_tokens(text):
    if pd.isna(text):
        return 0
    tokens = word_tokenize(text)
    return len(tokens)

total_tokens = enhanced_df['text'].apply(count_tokens).sum()

print(f"Total number of tokens in the 'text' column: {total_tokens}")

Total number of tokens in the 'text' column: 38950746


In [24]:
enhanced_df.to_parquet('processed_dataset.parquet')