In [1]:
import pandas as pd
import pyarrow as py

In [2]:
df = pd.read_parquet('subset_10_2.parquet')

In [3]:
df

Unnamed: 0,identifier,collection,license,text
0,github_open_source_10_2_0,Github OpenSource,Various open source,from typing import TypeVar\nfrom ndb_adapter.n...
1,US-3607956D-A_1,USPTO,Public Domain,Process for preparing allyl chloride and metha...
2,github_open_source_10_2_1,Github OpenSource,Various open source,﻿namespace NServiceBus.Unicast.Tests\n{\n u...
3,github_open_source_10_2_2,Github OpenSource,Various open source,// Copyright 2017 The Fuchsia Authors. All rig...
4,http://www.go4expert.com/forums/final-year-pro...,Creative Commons Common Crawl,Various open licenses,hello people...how are u....i am a student of ...
...,...,...,...,...
51465,bpt6k75250333_2,French-PD-Newspapers,Public Domain,"Nos voisins de l'Est peuvent, en la circonstan..."
51466,https://openalex.org/W2582118333_1,Spanish-Science-Pile,Various open science,"Cuad. Invest. Filol., 42 (2016), 67-80. DOI: 1..."
51467,https://bibtex.github.io/person/Yongseok_Oh.html,Creative Commons Common Crawl,Various open licenses,Travelled to:\n1 × Spain\nCollaborated with:\n...
51468,US-78283607-A_3,USPTO,Public Domain,Plasmid pLR186 was derived from the commercial...


In [4]:
import re
from langdetect import detect  # for language detection

In [5]:
def clean_dataset(df):
    # 1. Remove programming language entries (Github entries)
    df = df[df['collection'] != 'Github OpenSource'].copy()
    
    # 2. Clean text content
    def clean_text(text):
        if pd.isna(text):
            return text
        # Remove special characters and extra whitespace
        text = re.sub(r'[\\/\*\n\r\t]+', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()
    
    # Apply text cleaning
    df['text'] = df['text'].apply(clean_text)
    
    # 3. Detect English language
    def is_english(text):
        try:
            if pd.isna(text):
                return False
            return detect(text) == 'en'
        except:
            return False
    
    # Add English language flag
    df['is_english'] = df['text'].apply(is_english)
    
    # 4. Filter only English content
    df_clean = df[df['is_english']].copy()
    
    # 5. Reset index
    df_clean = df_clean.reset_index(drop=True)
    
    return df_clean

In [6]:
df = pd.read_parquet('subset_10_2.parquet')  # Replace with your actual file path
cleaned_df = clean_dataset(df)

In [7]:
cleaned_df

Unnamed: 0,identifier,collection,license,text,is_english
0,US-3607956D-A_1,USPTO,Public Domain,Process for preparing allyl chloride and metha...,True
1,http://www.go4expert.com/forums/final-year-pro...,Creative Commons Common Crawl,Various open licenses,hello people...how are u....i am a student of ...,True
2,sn84022149_1878-07-25_1_2_1,US-PD-Newspapers,Public Domain,"The Daily Leader. rmmm daily, xxcxrr Monday, b...",True
3,sermons01blai_1_9,English-PD,Public Domain,"The defire of it difeovers a liberal mind, and...",True
4,http://eunis.eea.europa.eu/species/224803,Creative Commons Common Crawl,Various open licenses,Kingdom: Animalia > Phylum: Arthropoda > Class...,True
...,...,...,...,...,...
21145,https://blog.geoactivegroup.com/2018/11/connec...,Creative Commons Common Crawl,Various open licenses,Technology | Media | Telecommunications Tuesda...,True
21146,1108673_2003_2,SEC,Public Domain,Gain on Sale of Discontinued Operations. For t...,True
21147,collegeofpharmac1959coll_1,English-PD,Public Domain,COLUMBIA UNIVERSITY BULLETIN Fifty-ninth Serie...,True
21148,https://bibtex.github.io/person/Yongseok_Oh.html,Creative Commons Common Crawl,Various open licenses,Travelled to: 1 × Spain Collaborated with: E.L...,True


In [8]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\umair\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\umair\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\umair\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\umair\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\umair\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
import pandas as pd
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from textstat import textstat

def further_clean_dataset(cleaned_df):
    # Start with your already cleaned DataFrame that has 'is_english' column
    df = cleaned_df.copy()
    
    # 1. Remove very short texts
    min_chars = 100
    df = df[df['text'].str.len() > min_chars]
    
    # 2. Enhanced text cleaning
    def deep_clean_text(text):
        if pd.isna(text):
            return text
            
        # Remove URLs
        text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
        
        # Remove email addresses
        text = re.sub(r'[\w\.-]+@[\w\.-]+', '', text)
        
        # Remove special characters but keep periods and basic punctuation
        text = re.sub(r'[^\w\s.,!?-]', ' ', text)
        
        # Remove multiple spaces
        text = re.sub(r'\s+', ' ', text)
        
        # Remove multiple periods
        text = re.sub(r'\.{2,}', '.', text)

        # Remove stop words
        stop_words = set(stopwords.words('english'))
        words = word_tokenize(text)
        words = [word for word in words if word.lower() not in stop_words]
       
        # Lemmatization
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]
        
        return ' '.join(words).strip()
    
    # Apply further cleaning to the already cleaned text
    df['text'] = df['text'].apply(deep_clean_text)
    
    # 3. Quality filters
    def text_quality_check(text):
        if pd.isna(text):
            return False
            
        # Check number of sentences
        sentences = sent_tokenize(text)
        if len(sentences) < 2:  # Require at least 2 sentences
            return False
            
        # Check for reasonable sentence length
        avg_sent_len = sum(len(s.split()) for s in sentences) / len(sentences)
        if avg_sent_len < 5 or avg_sent_len > 100:  # Filter out very short/long sentences
            return False
            
        # Check text-to-word ratio (detect garbage text)
        words = text.split()
        if len(words) == 0:
            return False
        char_to_word_ratio = len(text) / len(words)
        if char_to_word_ratio > 20:  # Likely garbage text
            return False
            
        return True
    
    df['is_quality'] = df['text'].apply(text_quality_check)
    
    # 4. Source-based filtering
    quality_sources = ['Wikipedia', 'courtlistener', 'Caselaw_Access_Project']
    df['is_quality_source'] = df['collection'].isin(quality_sources)
    
    # 5. Create content type tags
    def get_content_type(text, source):
        if pd.isna(text):
            return 'unknown'
        
        if 'OPINION AND ORDER' in text or 'NOTICE:' in text:
            return 'legal'
        elif len(text.split('\n')) > 10:
            return 'article'
        elif len(text) < 500:
            return 'short_form'
        else:
            return 'general'
    
    df['content_type'] = df.apply(lambda x: get_content_type(x['text'], x['collection']), axis=1)
    
    # 6. Final filtering
    final_df = df[
        (df['is_quality']) & 
        (df['content_type'] != 'unknown')
    ].copy()
    
    # 7. Drop intermediate columns and reset index
    final_df = final_df.drop(['is_quality', 'is_quality_source'], axis=1)
    final_df = final_df.reset_index(drop=True)
    
    return final_df

# Usage with your existing cleaned dataset:
further_cleaned_df = further_clean_dataset(cleaned_df)

# Get statistics
def get_dataset_stats(df):
    stats = {
        'total_rows': len(df),
        'avg_text_length': df['text'].str.len().mean(),
        'content_type_distribution': df['content_type'].value_counts(),
        'source_distribution': df['collection'].value_counts().head()
    }
    return stats

# Print statistics
print("Dataset Statistics:")
print(get_dataset_stats(further_cleaned_df))

Dataset Statistics:
{'total_rows': 20626, 'avg_text_length': 11962.09934063803, 'content_type_distribution': content_type
general       18599
short_form     2027
Name: count, dtype: int64, 'source_distribution': collection
Creative Commons Common Crawl    9937
USPTO                            2857
US-PD-Newspapers                 2015
English-PD                       1704
courtlistener                    1518
Name: count, dtype: int64}


In [10]:
further_cleaned_df

Unnamed: 0,identifier,collection,license,text,is_english,content_type
0,US-3607956D-A_1,USPTO,Public Domain,Process preparing allyl chloride methallyl chl...,True,general
1,sn84022149_1878-07-25_1_2_1,US-PD-Newspapers,Public Domain,"Daily Leader . rmmm daily , xxcxrr Monday , Tn...",True,general
2,sermons01blai_1_9,English-PD,Public Domain,"defire difeovers liberal mind , con- netted ma...",True,general
3,http://eunis.eea.europa.eu/species/224803,Creative Commons Common Crawl,Various open licenses,Kingdom Animalia Phylum Arthropoda Class Arach...,True,general
4,http://www.perseus.tufts.edu/hopper/text?doc=u...,Creative Commons Common Crawl,Various open licenses,"23 faith , Moses , born , hidden three month p...",True,general
...,...,...,...,...,...,...
20621,https://blog.geoactivegroup.com/2018/11/connec...,Creative Commons Common Crawl,Various open licenses,"Technology Media Telecommunications Tuesday , ...",True,general
20622,1108673_2003_2,SEC,Public Domain,Gain Sale Discontinued Operations . year ended...,True,general
20623,collegeofpharmac1959coll_1,English-PD,Public Domain,COLUMBIA UNIVERSITY BULLETIN Fifty-ninth Serie...,True,general
20624,https://bibtex.github.io/person/Yongseok_Oh.html,Creative Commons Common Crawl,Various open licenses,Travelled 1 Spain Collaborated E.Lee D.Lee Tal...,True,short_form


In [11]:
further_cleaned_df = further_cleaned_df.drop(columns=['license'])

In [12]:
# Assuming further_cleaned_df is your DataFrame after removing the 'license' column

# Identify and remove duplicates based on the 'identifier' column
def remove_duplicates(df):
    # Check for duplicates based on the 'identifier' column
    duplicates = df[df.duplicated(subset='identifier', keep=False)]

    # Print the number of duplicates found
    print(f"Number of duplicate entries found: {len(duplicates)}")

    # Remove duplicates, keeping the first occurrence
    df_no_duplicates = df.drop_duplicates(subset='identifier', keep='first')

    return df_no_duplicates

# Remove duplicates from the further_cleaned_df DataFrame
further_cleaned_df = remove_duplicates(further_cleaned_df)

# Get statistics after removing duplicates
def get_dataset_stats(df):
    stats = {
        'total_rows': len(df),
        'avg_text_length': df['text'].str.len().mean(),
        'content_type_distribution': df['content_type'].value_counts(),
        'source_distribution': df['collection'].value_counts().head()
    }
    return stats

# Print statistics
print("Dataset Statistics after Removing Duplicates:")
print(get_dataset_stats(further_cleaned_df))

Number of duplicate entries found: 2094
Dataset Statistics after Removing Duplicates:
{'total_rows': 19147, 'avg_text_length': 12679.504360996501, 'content_type_distribution': content_type
general       17365
short_form     1782
Name: count, dtype: int64, 'source_distribution': collection
Creative Commons Common Crawl    8458
USPTO                            2857
US-PD-Newspapers                 2015
English-PD                       1704
courtlistener                    1518
Name: count, dtype: int64}


In [13]:
# Display the final DataFrame
further_cleaned_df.head(10)

Unnamed: 0,identifier,collection,text,is_english,content_type
0,US-3607956D-A_1,USPTO,Process preparing allyl chloride methallyl chl...,True,general
1,sn84022149_1878-07-25_1_2_1,US-PD-Newspapers,"Daily Leader . rmmm daily , xxcxrr Monday , Tn...",True,general
2,sermons01blai_1_9,English-PD,"defire difeovers liberal mind , con- netted ma...",True,general
3,http://eunis.eea.europa.eu/species/224803,Creative Commons Common Crawl,Kingdom Animalia Phylum Arthropoda Class Arach...,True,general
4,http://www.perseus.tufts.edu/hopper/text?doc=u...,Creative Commons Common Crawl,"23 faith , Moses , born , hidden three month p...",True,general
5,sn84026089_1876-04-19_1_1_1,US-PD-Newspapers,"I. M. SEVERNS , M.D. , WHOLESALE RETAIL DRUGGI...",True,general
6,http://ccforum.com/content/16/6/245,Creative Commons Common Crawl,Review Year review 2011 Critical Care - neuroc...,True,general
7,sn86076142_1909-04-24_1_1_1,US-PD-Newspapers,Try WEATHER top Daily Bonanza Judicious Advert...,True,general
8,sn84038306_1920-07-14_1_2_1,US-PD-Newspapers,COLESBURG Chautauqua Colesburg consist five nu...,True,general
9,sn87093029_1908-11-27_1_5_1,US-PD-Newspapers,"TIME great thing always punctual , mean Dollar...",True,general


In [14]:
def get_detailed_content_type(text, source):
    """
    Enhanced content type classification using keywords and source information.
    Returns both main category and subcategory.
    """
    if pd.isna(text):
        return 'unknown', 'unknown'
    
    # Convert to lowercase for better matching
    text_lower = text.lower()
    words = set(word_tokenize(text_lower))
    
    # Define category keywords
    categories = {
        'legal': {
            'keywords': {'court', 'law', 'judge', 'legal', 'plaintiff', 'defendant', 'ruling', 'verdict', 'opinion', 'order', 'case'},
            'subcategories': {
                'court_decision': {'opinion', 'order', 'court', 'ruling', 'judgment'},
                'legal_document': {'brief', 'motion', 'petition', 'filing'},
                'legal_analysis': {'analysis', 'commentary', 'review'}
            }
        },
        'science': {
            'keywords': {'research', 'study', 'experiment', 'scientific', 'analysis', 'data', 'methodology', 'results'},
            'subcategories': {
                'medical': {'patient', 'treatment', 'clinical', 'medical', 'health', 'disease'},
                'technology': {'software', 'computer', 'algorithm', 'system', 'technology', 'digital'},
                'environmental': {'climate', 'environmental', 'ecology', 'sustainable', 'conservation'},
                'physics': {'physics', 'quantum', 'particle', 'energy', 'matter'},
                'biology': {'biology', 'cell', 'gene', 'protein', 'organism'}
            }
        },
        'history': {
            'keywords': {'history', 'historical', 'century', 'ancient', 'era', 'period', 'war', 'revolution'},
            'subcategories': {
                'political_history': {'government', 'politics', 'revolution', 'regime', 'democracy'},
                'social_history': {'society', 'cultural', 'social', 'community'},
                'military_history': {'war', 'battle', 'military', 'army', 'conflict'},
                'economic_history': {'economy', 'trade', 'market', 'economic', 'financial'}
            }
        },
        'academic': {
            'keywords': {'university', 'academic', 'research', 'study', 'theory', 'analysis'},
            'subcategories': {
                'research_paper': {'methodology', 'findings', 'conclusion', 'abstract'},
                'thesis': {'dissertation', 'thesis', 'research', 'study'},
                'review': {'literature', 'review', 'analysis', 'critique'}
            }
        },
        'news': {
            'keywords': {'news', 'report', 'journalist', 'press', 'media', 'announcement'},
            'subcategories': {
                'current_events': {'current', 'today', 'recent', 'latest'},
                'press_release': {'announces', 'released', 'statement', 'press'},
                'news_article': {'reported', 'according', 'sources'}
            }
        }
    }
    
    # Source-based initial classification
    if source in ['courtlistener', 'Caselaw_Access_Project']:
        main_category = 'legal'
    elif source == 'Wikipedia':
        # For Wikipedia, we'll determine category based on content
        main_category = None
    else:
        main_category = None
    
    # If main_category wasn't determined by source, use content analysis
    if not main_category:
        # Count keyword matches for each category
        category_scores = {}
        for category, data in categories.items():
            keyword_matches = len(words.intersection(data['keywords']))
            category_scores[category] = keyword_matches
        
        # Get category with highest score
        if any(category_scores.values()):
            main_category = max(category_scores.items(), key=lambda x: x[1])[0]
        else:
            main_category = 'general'
    
    # Determine subcategory
    if main_category in categories:
        subcategory_scores = {}
        for subcategory, subcategory_keywords in categories[main_category]['subcategories'].items():
            matches = len(words.intersection(subcategory_keywords))
            subcategory_scores[subcategory] = matches
        
        if any(subcategory_scores.values()):
            subcategory = max(subcategory_scores.items(), key=lambda x: x[1])[0]
        else:
            subcategory = f'{main_category}_general'
    else:
        subcategory = 'general'
    
    return main_category, subcategory

def enhance_content_classification(df):
    """
    Enhance the content classification of the dataset.
    """
    # Create new columns for main category and subcategory
    df[['main_category', 'subcategory']] = pd.DataFrame(
        df.apply(lambda x: get_detailed_content_type(x['text'], x['collection']), axis=1).tolist(),
        index=df.index
    )
    
    # Additional quality metrics
    df['text_length'] = df['text'].str.len()
    df['sentence_count'] = df['text'].apply(lambda x: len(sent_tokenize(x)) if pd.notna(x) else 0)
    
    return df

# Usage example:

enhanced_df = enhance_content_classification(further_cleaned_df)

# Get classification statistics
def get_classification_stats(df):
    stats = {
        'main_category_distribution': df['main_category'].value_counts(),
        'subcategory_distribution': df['subcategory'].value_counts(),
        'avg_text_length_by_category': df.groupby('main_category')['text_length'].mean(),
        'avg_sentences_by_category': df.groupby('main_category')['sentence_count'].mean(),
        'source_category_correlation': pd.crosstab(df['collection'], df['main_category'])
    }
    return stats

stats = get_classification_stats(enhanced_df)
print("Classification Statistics:")
for key, value in stats.items():
    print(f"\n{key}:")
    print(value)



Classification Statistics:

main_category_distribution:
main_category
legal       9398
general     3032
science     2985
history     2033
news         930
academic     769
Name: count, dtype: int64

subcategory_distribution:
subcategory
court_decision       7882
general              3032
legal_general         939
technology            887
history_general       839
medical               829
science_general       668
military_history      509
legal_document        458
news_general          385
thesis                365
current_events        323
social_history        290
political_history     257
biology               253
research_paper        235
environmental         196
press_release         155
physics               152
economic_history      138
legal_analysis        119
academic_general       92
review                 77
news_article           67
Name: count, dtype: int64

avg_text_length_by_category:
main_category
academic     7049.574772
general      2609.011214
history     10663.0

In [15]:
enhanced_df

Unnamed: 0,identifier,collection,text,is_english,content_type,main_category,subcategory,text_length,sentence_count
0,US-3607956D-A_1,USPTO,Process preparing allyl chloride methallyl chl...,True,general,legal,court_decision,17290,132
1,sn84022149_1878-07-25_1_2_1,US-PD-Newspapers,"Daily Leader . rmmm daily , xxcxrr Monday , Tn...",True,general,legal,court_decision,20179,307
2,sermons01blai_1_9,English-PD,"defire difeovers liberal mind , con- netted ma...",True,general,legal,court_decision,24575,302
3,http://eunis.eea.europa.eu/species/224803,Creative Commons Common Crawl,Kingdom Animalia Phylum Arthropoda Class Arach...,True,general,legal,court_decision,1307,10
4,http://www.perseus.tufts.edu/hopper/text?doc=u...,Creative Commons Common Crawl,"23 faith , Moses , born , hidden three month p...",True,general,history,history_general,751,7
...,...,...,...,...,...,...,...,...,...
20621,https://blog.geoactivegroup.com/2018/11/connec...,Creative Commons Common Crawl,"Technology Media Telecommunications Tuesday , ...",True,general,science,technology,2245,17
20622,1108673_2003_2,SEC,Gain Sale Discontinued Operations . year ended...,True,general,legal,court_decision,33118,228
20623,collegeofpharmac1959coll_1,English-PD,COLUMBIA UNIVERSITY BULLETIN Fifty-ninth Serie...,True,general,academic,thesis,42071,602
20624,https://bibtex.github.io/person/Yongseok_Oh.html,Creative Commons Common Crawl,Travelled 1 Spain Collaborated E.Lee D.Lee Tal...,True,short_form,general,general,487,4


In [16]:
enhanced_df = enhanced_df.drop(columns=['content_type'])

In [17]:
enhanced_df

Unnamed: 0,identifier,collection,text,is_english,main_category,subcategory,text_length,sentence_count
0,US-3607956D-A_1,USPTO,Process preparing allyl chloride methallyl chl...,True,legal,court_decision,17290,132
1,sn84022149_1878-07-25_1_2_1,US-PD-Newspapers,"Daily Leader . rmmm daily , xxcxrr Monday , Tn...",True,legal,court_decision,20179,307
2,sermons01blai_1_9,English-PD,"defire difeovers liberal mind , con- netted ma...",True,legal,court_decision,24575,302
3,http://eunis.eea.europa.eu/species/224803,Creative Commons Common Crawl,Kingdom Animalia Phylum Arthropoda Class Arach...,True,legal,court_decision,1307,10
4,http://www.perseus.tufts.edu/hopper/text?doc=u...,Creative Commons Common Crawl,"23 faith , Moses , born , hidden three month p...",True,history,history_general,751,7
...,...,...,...,...,...,...,...,...
20621,https://blog.geoactivegroup.com/2018/11/connec...,Creative Commons Common Crawl,"Technology Media Telecommunications Tuesday , ...",True,science,technology,2245,17
20622,1108673_2003_2,SEC,Gain Sale Discontinued Operations . year ended...,True,legal,court_decision,33118,228
20623,collegeofpharmac1959coll_1,English-PD,COLUMBIA UNIVERSITY BULLETIN Fifty-ninth Serie...,True,academic,thesis,42071,602
20624,https://bibtex.github.io/person/Yongseok_Oh.html,Creative Commons Common Crawl,Travelled 1 Spain Collaborated E.Lee D.Lee Tal...,True,general,general,487,4


In [18]:
def count_tokens(text):
    if pd.isna(text):
        return 0
    tokens = word_tokenize(text)
    return len(tokens)

# Apply the function to the 'text' column and sum the results
total_tokens = df['text'].apply(count_tokens).sum()

print(f"Total number of tokens in the 'text' column: {total_tokens}")

Total number of tokens in the 'text' column: 124105982


In [19]:
def count_tokens(text):
    if pd.isna(text):
        return 0
    tokens = word_tokenize(text)
    return len(tokens)

# Apply the function to the 'text' column and sum the results
total_tokens = enhanced_df['text'].apply(count_tokens).sum()

print(f"Total number of tokens in the 'text' column: {total_tokens}")

Total number of tokens in the 'text' column: 39527395


In [20]:
# Optional: Save processed dataset
enhanced_df.to_parquet('processed_dataset2.parquet')