In [147]:
import os
import pandas as pd

## Reading dataset

In [148]:
path = "archive/20news-19997/20_newsgroups"
df = []

# Walk every newsgroup folder and collect each message as raw bytes; decoding
# happens in a later step. os.listdir returns entries in arbitrary order, so the
# row order of the resulting list is not guaranteed to be the same on every machine.
for folder in os.listdir(path):
    folder_path = os.path.join(path, folder)
    if not os.path.isdir(folder_path):
        # Skip stray files sitting next to the newsgroup folders.
        continue
    for file in os.listdir(folder_path):
        # The context manager closes each handle as soon as the file has been
        # read instead of leaving one open handle per message.
        with open(os.path.join(folder_path, file), 'rb') as f:
            df.append(f.read())

In [149]:
# Wrap the collected messages in a single-column frame and summarise it.
df = pd.DataFrame(data=df)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19997 entries, 0 to 19996
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       19997 non-null  object
dtypes: object(1)
memory usage: 156.4+ KB


In [150]:
# Preview the first five raw messages.
df.head(5)

Unnamed: 0,0
0,"b""Newsgroups: sci.space\nPath: cantaloupe.srv...."
1,b'Newsgroups: sci.space\nPath: cantaloupe.srv....
2,"b""Newsgroups: sci.space\nPath: cantaloupe.srv...."
3,b'Newsgroups: sci.space\nPath: cantaloupe.srv....
4,b'Newsgroups: sci.space\nPath: cantaloupe.srv....


In [151]:
# Show the first message in full (row label 0, column label 0).
df.loc[0, 0]

b"Newsgroups: sci.space\nPath: cantaloupe.srv.cs.cmu.edu!rochester!udel!bogus.sura.net!news-feed-1.peachnet.edu!gatech!swrinde!sdd.hp.com!ux1.cso.uiuc.edu!news.cso.uiuc.edu!uxa.cso.uiuc.edu!gfk39017\nFrom: gfk39017@uxa.cso.uiuc.edu (George F. Krumins)\nSubject: Re: space news from Feb 15 AW&ST\nDate: Fri, 23 Apr 1993 20:16:24 GMT\nMessage-ID: <C5yDnC.GwB@news.cso.uiuc.edu>\nReferences: <C5ros0.uy@zoo.toronto.edu> <1993Apr23.155313.4220@dazixco.ingr.com>\nSender: usenet@news.cso.uiuc.edu (Net Noise owner)\nOrganization: University of Illinois at Urbana\nLines: 23\n\njbreed@doink.b23b.ingr.com (James B. Reed) writes:\n\n>In article <C5ros0.uy@zoo.toronto.edu>, henry@zoo.toronto.edu (Henry Spencer) writes:\n>|> [Pluto's] atmosphere will start to freeze out around 2010, and after about\n>|> 2005 increasing areas of both Pluto and Charon will be in permanent\n>|> shadow that will make imaging and geochemical mapping impossible.\n\nIt's my understanding that the freezing will start to occur 

## Converting bytes to string

In [152]:
def bytes_to_str(text):
    """Decode a bytes object as UTF-8, silently dropping undecodable bytes."""
    decoded = str(text, encoding='utf-8', errors='ignore')
    return decoded
# Decode every message from bytes to str, element by element.
df[0] = df[0].map(bytes_to_str)

## Removing headers

In [153]:
def remove_header(text):
    """Return the part of ``text`` after the first blank line.

    Everything up to and including the first ``'\\n\\n'`` is discarded. When no
    blank line is present the text is returned unchanged.
    """
    _header, separator, body = text.partition('\n\n')
    # An empty separator means no blank line was found.
    return body if separator else text
# Drop the header block from every message.
df[0] = df[0].map(remove_header)

In [154]:
# Spot-check one message (row label 127) to confirm the header is gone.
df.loc[127, 0]

'pbd@runyon.cim.cdc.com (Paul Dokas) writes:\n\n>I was reading Popular Science this morning and was surprised by an ad in\n>the back.  I know that a lot of the ads in the back of PS are fringe\n>science or questionablely legal, but this one really grabbed my attention.\n>It was from a company name "Personal Missle, Inc." or something like that.\n\nThe company was probably "Public Missiles, Inc" of Michigan.\n\n>Anyhow, the ad stated that they\'d sell rockets that were up to 20\' in length\n>and engines of sizes "F" to "M".  They also said that some rockets will\n>reach 50,000 feet.\n\nYup.\n\n>Now, aside from the obvious dangers to any amateur rocketeer using one\n>of these beasts, isn\'t this illegal?  I can\'t imagine the FAA allowing\n>people to shoot rockets up through the flight levels of passenger planes.\n>Not to even mention the problem of locating a rocket when it comes down.\n\nNope, it\'s not illegal. It is, however, closely regulated. In order to \npurchase and use the big 

## Dividing into build and validation datasets

In [155]:
from sklearn.model_selection import train_test_split

print(df.shape)

# Hold out 30% of the rows for validation; the fixed random_state keeps the
# split repeatable for a given row order.
split_options = {"test_size": 0.3, "random_state": 213}
X_build, X_val = train_test_split(df, **split_options)

for message, subset in (("X_build shape: {}", X_build), ("X_val shape: {}", X_val)):
    print(message.format(subset.shape))

(19997, 1)
X_build shape: (13997, 1)
X_val shape: (6000, 1)


In [156]:
# Keep an untouched snapshot of the full (pre-split) data.
original_df = df.copy()

# The frame returned by train_test_split is derived from `df` by row selection,
# so assigning columns on it may raise SettingWithCopyWarning. Take an explicit
# copy first. `df` and `X_build` still refer to the same object afterwards, so
# columns added to `df` below remain visible through `X_build` as before.
X_build = X_build.copy()
df = X_build
df[0] = df[0].astype(str)

In [157]:
# Give the single column a descriptive name, then summarise the build frame.
df.columns = pd.Index(["text"])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13997 entries, 12727 to 19755
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    13997 non-null  object
dtypes: object(1)
memory usage: 218.7+ KB


### Counting sentences

In [158]:
from nltk.tokenize import sent_tokenize
# Sentence count per document: tokenize and measure in a single pass.
# NOTE(review): sent_tokenize presumably needs the NLTK tokenizer data to be
# installed already — confirm for fresh environments.
df["sentences"] = df["text"].map(lambda document: len(sent_tokenize(document)))

### Counting question and exclamation marks

In [159]:
# Raw strings keep the backslashes intact for the regex engine. The non-raw
# "\?" and "\!" are invalid string escape sequences that newer Python versions
# warn about.
df["question_marks"] = df["text"].str.count(r"\?")
df["exclamation_marks"] = df["text"].str.count(r"\!")

### Counting links

In [160]:
# A link is anything starting with http://, https:// or www. up to the next whitespace.
link_pattern = r'https?://\S+|www\.\S+'
df["links"] = df["text"].str.count(link_pattern)

### Counting e-mail addresses

In [161]:
# local-part @ domain . top-level domain of at least two letters
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
df["emails"] = df["text"].str.count(email_pattern)

## Cleaning

In [162]:
import string

def clean(df):
    """Normalise the ``text`` column of ``df`` with a fixed series of regex substitutions.

    The substitutions run in the order listed below; later steps rely on the
    earlier ones (for example, links and e-mail addresses are removed before
    punctuation is stripped, while they are still recognisable).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``"text"``.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in; its ``"text"`` column is
        overwritten with the cleaned text.
    """
    # Imported here so the function does not depend on a later cell having
    # already imported `re` (a fresh-kernel run would otherwise hit NameError).
    import re

    # (pattern, replacement) pairs, applied top to bottom. Raw strings avoid
    # invalid string escape sequences such as '\S' or '\['.
    substitutions = [
        (r'https?://\S+|www\.\S+', ''),                                  # links
        (r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', ''),     # e-mail addresses
        (r'<.*?>+', ''),                                                 # tags
        (r'\[.*?\]', ''),                                                # square brackets
        ('[%s]' % re.escape(string.punctuation), ''),                    # punctuation marks
        ('\n', ' '),                                                     # newline character
        (r'\W', ' '),                                                    # non-alphanumeric characters
        (r'\w*\d\w*', ''),                                               # tokens containing a digit
        (r'\b\w\b', ''),                                                 # one-letter words
        (r'\s+', ' '),                                                   # runs of whitespace
    ]

    text = df["text"]
    for pattern, replacement in substitutions:
        text = text.str.replace(pattern, replacement, regex=True)
    df["text"] = text
    return df

# Clean the text column and preview the first five rows.
df = clean(df)
df.head(5)

Unnamed: 0,text,sentences,question_marks,exclamation_marks,links,emails
12727,In article Robert Castro writes Would anyone o...,14,1,8,0,3
12958,In article Jerry Hartzler CATS writes In artic...,5,0,0,0,5
17116,try to unsubscribe from this group by sending...,4,1,0,0,1
6186,In article John Eaton writes Thats one problem...,13,3,0,0,2
6782,The key question is whether nonClipper encrypt...,16,2,0,0,1


## Converting to lowercase

### Counting capital letters

In [163]:
# Count ASCII capitals first: lowercasing on the next line removes them.
uppercase_pattern = r'[A-Z]'
df["capital_letters"] = df["text"].str.count(uppercase_pattern)
df["text"] = df["text"].str.lower()

In [164]:
# Preview the lower-cased text together with the new capital_letters column.
df.head(5)

Unnamed: 0,text,sentences,question_marks,exclamation_marks,links,emails,capital_letters
12727,in article robert castro writes would anyone o...,14,1,8,0,3,64
12958,in article jerry hartzler cats writes in artic...,5,0,0,0,5,24
17116,try to unsubscribe from this group by sending...,4,1,0,0,1,27
6186,in article john eaton writes thats one problem...,13,3,0,0,2,20
6782,the key question is whether nonclipper encrypt...,16,2,0,0,1,30


### CountVectorizer

In [167]:
import warnings
import re
from sklearn.feature_extraction.text import CountVectorizer

def custom_tokenizer(text):
    """Split ``text`` into word tokens.

    A token is a maximal run of word characters (letters, digits, underscore)
    delimited by word boundaries, so whitespace and punctuation act as
    separators. Unlike scikit-learn's default token pattern, one-character
    tokens are kept.

    Returns a list of strings; an empty list when ``text`` has no word characters.
    """
    # re.findall already returns str objects, so no further conversion is needed.
    return re.findall(r'\b\w+\b', text)

count_vectorizer = CountVectorizer(tokenizer=custom_tokenizer, stop_words = "english")

# Silence UserWarnings only while fitting. A session-wide filter would also
# hide unrelated warnings raised by every later cell.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=UserWarning)
    count_df = count_vectorizer.fit_transform(df["text"])

count_df

<13997x90434 sparse matrix of type '<class 'numpy.int64'>'
	with 1177488 stored elements in Compressed Sparse Row format>

### TfidfVectorizer

In [168]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weights over the cleaned text, with English stop words removed and
# max_df set to 0.7.
tfidf_options = {"stop_words": "english", "max_df": 0.7}
tfidf_vectorizer = TfidfVectorizer(**tfidf_options)
tfidf_df = tfidf_vectorizer.fit_transform(df["text"])
tfidf_df

<13997x90434 sparse matrix of type '<class 'numpy.float64'>'
	with 1177488 stored elements in Compressed Sparse Row format>

In [169]:
# Convert both sparse document-term matrices to dense, labelled frames.
# .toarray() replaces the `.A` shorthand, which is not available on sparse
# matrices in recent SciPy releases.
# NOTE: the dense arrays hold one cell per (document, term) pair, so this step
# is memory-hungry for a large vocabulary.
count_df = pd.DataFrame(count_df.toarray(), columns = count_vectorizer.get_feature_names_out())
tfidf_df = pd.DataFrame(tfidf_df.toarray(), columns = tfidf_vectorizer.get_feature_names_out())

# Terms that the count vocabulary has but the TF-IDF vocabulary lacks.
difference = set(count_df.columns) - set(tfidf_df.columns)
print(difference)

set()
