In [None]:
#============ TASK 1 ============

import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from collections import Counter

# Fetch the NLTK resources this notebook relies on: the Punkt tokenizer models
# (used by word_tokenize) and the English stop-word list.
# Newer NLTK releases load Punkt from the 'punkt_tab' package rather than the
# pickled 'punkt' package, so both are requested to keep word_tokenize working
# regardless of the installed NLTK version.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# Load the sample CSV into a DataFrame
file_path = "news_sample.csv"
textpd = pd.read_csv(file_path, encoding="utf-8")

# Define the clean_text function
# Month names (abbreviated or written out) accepted by the textual-date rule.
_MONTH_NAME = (
    r'(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?|'
    r'aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)'
)

# Ordered (pattern, placeholder) substitutions applied by clean_text.
# Order matters:
#   * URLs and e-mail addresses go first so digits inside them cannot be
#     rewritten by the date/number rules before the address is recognised.
#   * ISO dates (yyyy-mm-dd) go before the generic d-m-y rule, which would
#     otherwise match only the tail of an ISO date ("20-01-15" in "2020-01-15").
#   * Plain numbers go last so they only catch what no other rule claimed.
_CLEAN_RULES = (
    (re.compile(r'http[s]?://[^\s]+'), "<URL>"),
    (re.compile(r'[\w._%+-]+@[\w.-]+\.[a-zA-Z]{2,}'), "<EMAIL>"),
    (re.compile(r'\d{4}-\d{2}-\d{2}'), "<DATE>"),
    (re.compile(r'\d{1,2}[./-]\d{1,2}[./-]\d{2,4}'), "<DATE>"),
    (re.compile(r'\b' + _MONTH_NAME + r'\.? \d{1,2},? \d{4}'), "<DATE>"),
    (re.compile(r'\d+(\.\d+)?'), "<NUM>"),
)


def clean_text(data):
    """Normalise a piece of text and replace volatile values with placeholders.

    The text is lower-cased, runs of whitespace are collapsed to one space, and
    URLs, e-mail addresses, dates and numbers are replaced by ``<URL>``,
    ``<EMAIL>``, ``<DATE>`` and ``<NUM>`` respectively.

    Args:
        data: The value to clean. Anything that is not a ``str`` (e.g. NaN or
            None from a missing CSV cell) is treated as missing.

    Returns:
        The cleaned string, or ``""`` when ``data`` is not a string.
    """
    if not isinstance(data, str):  # Handle NaN / missing values safely
        return ""
    data = re.sub(r'\s+', " ", data.lower())
    for pattern, placeholder in _CLEAN_RULES:
        data = pattern.sub(placeholder, data)
    return data

# Columns whose text is cleaned and pooled for the Task 1 token statistics
columns_to_clean1 = ["id", "domain", "type", "url", "content", "title", "authors", "keywords", "meta_keywords", "meta_description", "tags", "summary"]

# Only columns that actually exist in the CSV are processed (avoids KeyError)
present_columns1 = [col for col in columns_to_clean1 if col in textpd.columns]


def _clean_cell(value):
    """Clean one cell; a missing value becomes "" instead of the literal text "nan"."""
    if pd.isna(value):
        return ""
    return clean_text(str(value))


def _reduction_rate(before, after):
    """Return the percentage by which `after` is smaller than `before` (0.0 if `before` is 0)."""
    if not before:
        return 0.0
    return (1 - after / before) * 100


# Apply cleaning to each column. Missing cells are mapped to "" here; converting
# the whole column with astype(str) first would turn NaN into the word "nan",
# which would then be tokenized and counted.
for col in present_columns1:
    textpd[col] = textpd[col].map(_clean_cell)

# Combine all cleaned text from the DataFrame columns into one string
full_text = " ".join(textpd[col].astype(str).str.cat(sep=" ") for col in present_columns1)

# Tokenize the cleaned text
tokens1 = word_tokenize(full_text)
print("Tokens:", len(tokens1))

# Fetch the English stop words
stop_words = set(stopwords.words('english'))

# Remove stop words (case-insensitively)
filtered_tokens1 = [w for w in tokens1 if w.lower() not in stop_words]
print("Tokens - remowed stop words:", len(filtered_tokens1))
# Share of tokens dropped by stop-word removal
Reductionrate_after_stemming_and_stopwords = _reduction_rate(len(tokens1), len(filtered_tokens1))
print("Reduction rate after remowing stop words:", Reductionrate_after_stemming_and_stopwords)

# Stem the remaining tokens
ps = PorterStemmer()
stemmed_tokens1 = [ps.stem(w) for w in filtered_tokens1]
print("Tokens - stemmed:", len(stemmed_tokens1))

# Vocabulary sizes (number of distinct tokens) at each stage
unique_tokens = len(set(tokens1))
unique_filtered_tokens = len(set(filtered_tokens1))
unique_stemmed_tokens = len(set(stemmed_tokens1))

# Stemming maps every token to exactly one stem, so the token COUNT never changes
# and a count-based rate is always 0. The effect of stemming is on the
# vocabulary, so the rate is computed on the number of distinct tokens.
Reductionrate_after_stemming = _reduction_rate(unique_filtered_tokens, unique_stemmed_tokens)
print("Vocabulary reduction rate from stemming (after removing stopwords):", Reductionrate_after_stemming)

print("Unique tokens:", unique_tokens)
print("Unique tokens after remowing stopwords:", unique_filtered_tokens)
print("Unique tokens after stemming:", unique_stemmed_tokens)



In [None]:
#============ TASK 2 ============

# Number of CSV rows read per chunk while streaming the large file
chunksize = 25_000
# Path of the full raw dataset (replaces the Task 1 sample path held in file_path)
file_path = "995,000_rows.csv"

# Define function to tokenize, remove stopwords, and stem
def tokenize_and_stem(text):
    """Tokenize `text`, drop non-alphabetic tokens and stop words, and stem the rest.

    Relies on the module-level `stop_words` set and `ps` stemmer created in the
    Task 1 cell.

    Args:
        text: The string to tokenize.

    Returns:
        A list with the stem of every kept token, in order of appearance.
    """
    return [
        ps.stem(word)
        for word in word_tokenize(text)
        # Compare case-insensitively (as the Task 1 filter does) so capitalised
        # stop words such as "The" are removed too.
        if word.isalpha() and word.lower() not in stop_words
    ]

# Columns that are passed through clean_text in every chunk
columns_to_clean = [
    "id", "domain", "type", "url", "content", "title", "authors", "keywords",
    "meta_keywords", "meta_description", "tags", "summary"
]

# Processed chunks are collected here and concatenated once at the end
preprocessed_data = []

# Stream the raw file chunk by chunk
chunk_reader = pd.read_csv(file_path, chunksize=chunksize, low_memory=False)
for chunk_number, chunk in enumerate(chunk_reader):
    # Clean every configured column that this chunk actually contains.
    # NOTE(review): clean_text returns "" for any non-string cell, so a column
    # parsed as numeric would be blanked here - confirm this is intended.
    columns_in_chunk = [name for name in columns_to_clean if name in chunk.columns]
    for col in columns_in_chunk:
        chunk[col] = chunk[col].apply(clean_text)

    # Only 'content' is additionally tokenized, stop-word filtered and stemmed;
    # each cell becomes a list of stems.
    if 'content' in chunk.columns:
        content_as_text = chunk['content'].astype(str)
        chunk['content'] = content_as_text.apply(tokenize_and_stem)

    preprocessed_data.append(chunk)
    # Report progress (1-based chunk number)
    print(f"Processed chunk {chunk_number + 1}")

# Merge all processed chunks and write them to a single CSV
final_df = pd.concat(preprocessed_data, ignore_index=True)
final_df.to_csv("cleaned_file.csv", index=False)





In [None]:
#============ TASK 3 ============

import pandas as pd
import re
from collections import Counter
import matplotlib.pyplot as plt
import ast

cleaned_file = "cleaned_file.csv"
chunksize = 25000
# Full in-memory copy of the cleaned data. The statistics below do not use it;
# they stream the file in chunks instead.
cleaned_file_csv = pd.read_csv(cleaned_file)

# Columns to analyze for missing metadata
metadata_cols = ['authors', 'meta_keywords', 'meta_description', 'tags', 'summary']

# Patterns for counting the placeholders inserted by clean_text.
# Task 2 stores 'content' as a list of alphabetic stems, so the literal
# "<URL>" / "<DATE>" / "<EMAIL>" markers can never appear in cleaned_file.csv.
# What survives is the bare stem (e.g. 'url'), so both spellings are counted.
# Caveat: genuine uses of the words "url", "date" and "email" produce the same
# stems and are included in these totals.
url_pattern = r'<URL>|\burl\b'
date_pattern = r'<DATE>|\bdate\b'
email_pattern = r'<EMAIL>|\bemail\b'
# Pattern used to split the content into words for the frequency count
word_regex = r'\w+'

# Accumulators for initial observations
total_rows = 0
# Starts at zero for every metadata column so the report also works if the file has no rows
missing_counts_acc = pd.Series(0, index=metadata_cols)
domain_counts_acc = {}
type_counts_acc = {}
author_counts_acc = {}
error_count_acc = 0
content_lengths = []

# Accumulators for content analysis
total_urls = 0
total_dates = 0
total_emails = 0
word_counter = Counter()


def _add_counts(accumulator, counts):
    """Add the per-chunk value counts in `counts` to the running dict `accumulator`."""
    for key, count in counts.items():
        accumulator[key] = accumulator.get(key, 0) + count


# Process the CSV in chunks
for chunk in pd.read_csv(cleaned_file, chunksize=chunksize, low_memory=False):
    total_rows += len(chunk)

    # Observation 1: Missing values for metadata columns
    missing_counts_acc = missing_counts_acc + chunk[metadata_cols].isnull().sum()

    # Observation 2: Domain distribution
    _add_counts(domain_counts_acc, chunk['domain'].value_counts())

    # Observation 3: Type distribution
    _add_counts(type_counts_acc, chunk["type"].value_counts())

    # Observation 4: Author distribution
    _add_counts(author_counts_acc, chunk["authors"].value_counts())

    # Observation 5: Content artifacts and anomalies.
    # Missing content is treated as empty text so it is not counted as the word "nan".
    content = chunk['content'].fillna("").astype(str)

    # Count rows whose content contains the word "error"
    error_mask = content.str.contains(r"\berror\b", case=False, regex=True, na=False)
    error_count_acc += error_mask.sum()

    # Count URL, date and e-mail placeholders
    total_urls += content.str.count(url_pattern).sum()
    total_dates += content.str.count(date_pattern).sum()
    total_emails += content.str.count(email_pattern).sum()

    # Aggregate word counts for the most frequent words
    combined_text = " ".join(content.tolist())
    word_counter.update(re.findall(word_regex, combined_text))

# Results after processing
print("Total rows processed:", total_rows)

print("\nMissing values in metadata columns (count):")
print(missing_counts_acc)
print("\nMissing values in metadata columns (percentage):")
print((missing_counts_acc / total_rows * 100).round(2))

print("\nDomain distribution (top 10):")
domain_series = pd.Series(domain_counts_acc).sort_values(ascending=False)
print(domain_series.head(10))

print("\nType distribution (top 10):")
type_series = pd.Series(type_counts_acc).sort_values(ascending=False)
print(type_series.head(10))

print("\nAuthor distribution (top 10):")
author_series = pd.Series(author_counts_acc).sort_values(ascending=False)
print(author_series.head(10))

# Top 100 most frequent words
top_100_words = word_counter.most_common(100)
print("\nTop 100 most frequent words:")
for word, count in top_100_words:
    print(f"{word}: {count}")

print("\nTotal articles with 'error' in content:", error_count_acc)

# Additional content analysis results
print("\nTotal URLs found in content:", total_urls)
print("Total dates found in content:", total_dates)
print("Total total_emails found in content:", total_emails)

# Plot frequency against rank for the (up to) 10,000 most frequent words on
# log-log axes
top_10000 = word_counter.most_common(10000)
if top_10000:
    ranks = range(1, len(top_10000) + 1)
    frequencies = [freq for word, freq in top_10000]

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(ranks, frequencies)
    ax.set_xlabel('Rank')
    ax.set_ylabel('Frequency')
    ax.set_title('Frequency of the Top 10,000 Words')
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.grid(True, which="both", ls="--")
    plt.show()

else:
    print("Not enough words to plot the top 10,000 frequencies.")

In [None]:
#============ TASK 3.1 ============
""" 
Top-ord in "content" distribution barplot
"""
# Bar plot of the 100 most frequent tokens in the cleaned 'content' column.
cleaned_word_counter = Counter()

# Read the token lists chunk by chunk and accumulate their frequencies
for chunk in pd.read_csv(cleaned_file, chunksize=chunksize, low_memory=False):
    if 'content' not in chunk.columns:
        continue
    # The CSV stores each token list as its string representation; turn the
    # non-missing cells back into real Python lists.
    parsed_rows = chunk['content'].dropna().apply(ast.literal_eval)
    for token_list in parsed_rows:
        if isinstance(token_list, list):
            cleaned_word_counter.update(token_list)

# Split the 100 most common (word, count) pairs into parallel tuples for plotting
top_100 = cleaned_word_counter.most_common(100)
words, freqs = zip(*top_100)

fig, ax = plt.subplots(figsize=(10, 20))
ax.barh(words, freqs)
ax.set_xlabel("Frekvens")
ax.set_ylabel("Ord")
ax.set_title("Top 100 mest hyppige ord")
ax.invert_yaxis()  # most frequent word at the top
fig.tight_layout()
plt.show()

In [None]:
#============ TASK 3.2 ============
"""
Top-ord in "content" distribution barplot for the "raw" file 
"""
# Bar plot of the 100 most frequent words in the 'content' column of the file
# that file_path points to, using a simple regex tokenizer.
raw_word_counter = Counter()

for chunk in pd.read_csv(file_path, chunksize=chunksize, low_memory=False):
    if 'content' not in chunk.columns:
        continue
    raw_texts = chunk['content'].dropna().astype(str)
    for text in raw_texts:
        # Simple tokenization: lower-case the text and take every run of word characters
        raw_word_counter.update(re.findall(r'\w+', text.lower()))

# Split the 100 most common (word, count) pairs into parallel tuples for plotting
top_100 = raw_word_counter.most_common(100)
words, freqs = zip(*top_100)

fig, ax = plt.subplots(figsize=(10, 20))
ax.barh(words, freqs)
ax.set_xlabel("Frekvens")
ax.set_ylabel("Ord")
ax.set_title("Top 100 mest hyppige ord (før rensning)")
ax.invert_yaxis()  # most frequent word at the top
fig.tight_layout()
plt.show()

In [None]:
#============ TASK 3.3 ============
""" 
Top-ord in "content" according to "type" distribution barplot
"""
# One bar plot per article type showing its 10 most frequent content tokens.
from collections import defaultdict, Counter
type_word_counters = defaultdict(Counter)

# Read in chunks and accumulate token counts per type
for chunk in pd.read_csv(cleaned_file, chunksize=chunksize, low_memory=False):
    if 'type' not in chunk.columns or 'content' not in chunk.columns:
        continue

    # Keep only rows that have both a type and content
    labelled = chunk.dropna(subset=['type', 'content'])

    # The CSV stores each token list as its string representation; convert it
    # back into a Python list. The result is kept in its own Series rather
    # than assigned into the filtered frame, which avoids writing to a slice.
    token_lists = labelled['content'].apply(ast.literal_eval)

    # Pair each row's type with its tokens directly instead of using iterrows
    for t, tokens in zip(labelled['type'], token_lists):
        if isinstance(tokens, list):
            type_word_counters[t].update(tokens)

# Plot the top 10 words per type
for t, counter in type_word_counters.items():
    top_words = counter.most_common(10)
    if not top_words:
        # A type whose rows only contain empty token lists has nothing to
        # plot; unpacking zip(*[]) below would raise ValueError.
        continue
    words, freqs = zip(*top_words)

    plt.figure(figsize=(8, 5))
    plt.barh(words, freqs)
    plt.xlabel("Frekvens")
    plt.title(f"Top 10 hyppige ord i type '{t}'")
    plt.gca().invert_yaxis()  # most frequent word at the top
    plt.tight_layout()
    plt.show()

In [None]:
#============ TASK 4 ============

from sklearn.model_selection import train_test_split

# `cleaned_file` holds only the CSV path; load the data it points to before
# splitting (indexing the path string with 'type' raises a TypeError).
cleaned_df = pd.read_csv(cleaned_file)

# Split the dataset into 80% train, 10% test and 10% validation:
# first hold out 20% of the rows, then halve that hold-out set.
X_train, X_rest, y_train, y_rest = train_test_split(cleaned_df, cleaned_df['type'], test_size=0.2, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_rest, y_rest, test_size=0.5, random_state=42)

# Every row must end up in exactly one of the three splits
assert len(X_train) + len(X_val) + len(X_test) == len(cleaned_df)

# Check the split sizes
print("Training data shape:", len(X_train))
print("Validation data shape:", len(X_val))
print("Testing data shape:", len(X_test))


# Save each split to its own CSV file (the 'type' label column is part of each frame)
X_train.to_csv("X_train.csv", index=False)
X_val.to_csv("X_val.csv", index=False)
X_test.to_csv("X_test.csv", index=False)