In [84]:
import os
import pandas as pd

## Reading dataset

In [85]:
path = "archive/20news-19997/20_newsgroups"
df = []

# Walk every newsgroup folder and keep each post as raw bytes; decoding is
# deferred so that undecodable bytes can be handled in one place later.
# NOTE(review): os.listdir returns entries in arbitrary order, so the row order
# (and therefore the seeded split below) can differ between machines.
for folder in os.listdir(path):
    for file in os.listdir(f'{path}/{folder}'):
        # The context manager closes each handle as soon as it has been read;
        # without it every one of the posts leaves an open file descriptor.
        with open(f'{path}/{folder}/{file}', 'rb') as f:
            df.append(f.read())

In [86]:
# Wrap the list of raw posts in a single-column DataFrame (the column is labelled 0).
df = pd.DataFrame(df)
# Summarise row count, dtypes and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19997 entries, 0 to 19996
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       19997 non-null  object
dtypes: object(1)
memory usage: 156.4+ KB


In [87]:
# Preview the first five rows.
df.head()

Unnamed: 0,0
0,"b""Newsgroups: sci.space\nPath: cantaloupe.srv...."
1,b'Newsgroups: sci.space\nPath: cantaloupe.srv....
2,"b""Newsgroups: sci.space\nPath: cantaloupe.srv...."
3,b'Newsgroups: sci.space\nPath: cantaloupe.srv....
4,b'Newsgroups: sci.space\nPath: cantaloupe.srv....


In [88]:
# Inspect one full record (column 0, row label 0): still a bytes object with the headers attached.
df[0][0]

b"Newsgroups: sci.space\nPath: cantaloupe.srv.cs.cmu.edu!rochester!udel!bogus.sura.net!news-feed-1.peachnet.edu!gatech!swrinde!sdd.hp.com!ux1.cso.uiuc.edu!news.cso.uiuc.edu!uxa.cso.uiuc.edu!gfk39017\nFrom: gfk39017@uxa.cso.uiuc.edu (George F. Krumins)\nSubject: Re: space news from Feb 15 AW&ST\nDate: Fri, 23 Apr 1993 20:16:24 GMT\nMessage-ID: <C5yDnC.GwB@news.cso.uiuc.edu>\nReferences: <C5ros0.uy@zoo.toronto.edu> <1993Apr23.155313.4220@dazixco.ingr.com>\nSender: usenet@news.cso.uiuc.edu (Net Noise owner)\nOrganization: University of Illinois at Urbana\nLines: 23\n\njbreed@doink.b23b.ingr.com (James B. Reed) writes:\n\n>In article <C5ros0.uy@zoo.toronto.edu>, henry@zoo.toronto.edu (Henry Spencer) writes:\n>|> [Pluto's] atmosphere will start to freeze out around 2010, and after about\n>|> 2005 increasing areas of both Pluto and Charon will be in permanent\n>|> shadow that will make imaging and geochemical mapping impossible.\n\nIt's my understanding that the freezing will start to occur 

## Converting bytes to string

In [89]:
def bytes_to_str(text):
    """Decode a bytes payload as UTF-8, silently dropping any undecodable bytes."""
    decoded = str(text, 'utf-8', 'ignore')
    return decoded
# Replace the raw bytes in column 0 with their decoded text.
df[0]=df[0].apply(bytes_to_str)

## Removing headers

In [90]:
def remove_header(text):
    """Return everything after the first blank line of ``text``.

    The first blank line (two consecutive newlines) is treated as the end of
    the header block. When no blank line exists the text is returned unchanged.
    """
    _header, separator, body = text.partition('\n\n')
    return body if separator else text
# Strip the header block from every post, keeping only the body.
df[0] = df[0].apply(remove_header)

In [91]:
# Spot-check one post (row label 127) after header removal.
df[0][127]

'pbd@runyon.cim.cdc.com (Paul Dokas) writes:\n\n>I was reading Popular Science this morning and was surprised by an ad in\n>the back.  I know that a lot of the ads in the back of PS are fringe\n>science or questionablely legal, but this one really grabbed my attention.\n>It was from a company name "Personal Missle, Inc." or something like that.\n\nThe company was probably "Public Missiles, Inc" of Michigan.\n\n>Anyhow, the ad stated that they\'d sell rockets that were up to 20\' in length\n>and engines of sizes "F" to "M".  They also said that some rockets will\n>reach 50,000 feet.\n\nYup.\n\n>Now, aside from the obvious dangers to any amateur rocketeer using one\n>of these beasts, isn\'t this illegal?  I can\'t imagine the FAA allowing\n>people to shoot rockets up through the flight levels of passenger planes.\n>Not to even mention the problem of locating a rocket when it comes down.\n\nNope, it\'s not illegal. It is, however, closely regulated. In order to \npurchase and use the big 

## Dividing into build and validation datasets

In [92]:
from sklearn.model_selection import train_test_split

# Size of the full corpus before splitting.
print(df.shape)

# Hold out 30 % of the rows for validation; the fixed seed makes the split repeatable.
X_build, X_val = train_test_split(df, random_state=213, test_size=0.3)

print(f"X_build shape: {X_build.shape}")
print(f"X_val shape: {X_val.shape}")

(19997, 1)
X_build shape: (13997, 1)
X_val shape: (6000, 1)


In [93]:
# Snapshot of the complete header-stripped corpus (build and validation rows).
original_df = df.copy()
# Work on an independent copy of the build split. Binding `df` directly to
# X_build would make every later column assignment silently mutate X_build too,
# and can raise SettingWithCopyWarning because the split result is derived
# from another frame.
df = X_build.copy()
# Make sure the text column holds plain Python strings.
df[0] = df[0].astype(str)

In [94]:
# Give the single column a descriptive name; later cells refer to it as "text".
df.columns = ["text"]
# Confirm the build split's row count and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13997 entries, 12727 to 19755
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    13997 non-null  object
dtypes: object(1)
memory usage: 218.7+ KB


### Counting sentences

In [95]:
from nltk.tokenize import sent_tokenize

# Number of sentences NLTK's sentence tokenizer finds in each post.
df["sentences"] = df["text"].map(lambda post: len(sent_tokenize(post)))

### Counting question and exclamation marks

In [96]:
# Question / exclamation marks per sentence.
# The "?" pattern is a raw string because "\?" in a normal string literal is an
# invalid escape sequence; "!" has no special meaning in a regex and needs no escape.
# A post with zero sentences yields NaN (0 / 0) in both columns.
df["question_marks_ratio"] = df["text"].str.count(r"\?")/df["sentences"]
df["exclamation_marks_ratio"] = df["text"].str.count("!")/df["sentences"]

### Counting links

In [97]:
# Count substrings that start with http://, https:// or www. and run up to the next whitespace.
df["links"] = df["text"].str.count(r'https?://\S+|www\.\S+')

### Counting e-mail addresses

In [98]:
# Count e-mail-like tokens of the form local-part@domain.tld (top-level domain of at least two letters).
df["emails_addresses"] = df["text"].str.count(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

## Cleaning

In [99]:
import string
import re

def clean(df):
    """Normalise the ``"text"`` column of ``df`` in place and return ``df``.

    The substitutions run in a fixed order, which matters: links, e-mail
    addresses, tags and bracketed spans must be removed while the punctuation
    that delimits them is still present.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``"text"``. The column is overwritten.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # (regex, replacement) pairs. Regex literals are raw strings so that
    # sequences such as \S or \[ are not treated as (invalid) Python escapes.
    steps = (
        (r'https?://\S+|www\.\S+', ''),                                # links
        (r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', ''),   # e-mail addresses
        (r'<.*?>+', ''),                                               # tags
        (r'\[.*?\]', ''),                                              # square brackets
        ('[%s]' % re.escape(string.punctuation), ''),                  # punctuation marks
        ('\n', ' '),                                                   # newline character
        (r'\W', ' '),                                                  # non-alphanumeric characters
        (r'\w*\d\w*', ''),                                             # tokens containing a digit
        (r'\b\w\b', ''),                                               # one-letter words
        (r'\s+', ' '),                                                 # runs of whitespace
    )

    text = df["text"]
    for pattern, replacement in steps:
        text = text.str.replace(pattern, replacement, regex=True)

    df["text"] = text
    return df

# Clean the text column (clean() modifies the frame it is given and returns it).
df = clean(df)
# Preview the cleaned text next to the features computed so far.
df.head()

Unnamed: 0,text,sentences,question_marks_ratio,exclamation_marks_ratio,links,emails_addresses
12727,In article Robert Castro writes Would anyone o...,14,0.071429,0.571429,0,3
12958,In article Jerry Hartzler CATS writes In artic...,5,0.0,0.0,0,5
17116,try to unsubscribe from this group by sending...,4,0.25,0.0,0,1
6186,In article John Eaton writes Thats one problem...,13,0.230769,0.0,0,2
6782,The key question is whether nonClipper encrypt...,16,0.125,0.0,0,1


## Converting to lowercase

In [100]:
# These two features must be measured before lowercasing destroys the case information.
# "chars" is the full string length of the text, spaces included.
df["chars"]=df["text"].str.len()
# Share of ASCII letters that are upper case. The denominator class is
# [A-Za-z]; the previous [A-za-z] was a typo whose A-z range also matches
# the characters [ \ ] ^ _ ` that sit between "Z" and "a" in ASCII.
# Text without any ASCII letter yields NaN (0 / 0).
df["capital_letters_ratio"] = df["text"].str.count(r'[A-Z]')/df["text"].str.count(r'[A-Za-z]')
df["text"] = df['text'].str.lower()

## Removing stopwords

In [101]:
from nltk.corpus import stopwords

# Lazily filled cache so the stop-word corpus is read once, not once per row.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them on first use."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def remove_stopwords(text):
    """Return ``text`` with English stop words removed.

    The text is split on whitespace, tokens found in NLTK's English stop-word
    list are dropped, and the remaining tokens are re-joined with single
    spaces. Matching is exact (case-sensitive), so callers should lowercase
    the text first.
    """
    stop = _english_stopwords()
    return ' '.join(word for word in text.split() if word not in stop)

# Drop English stop words from every post.
df['text'] = df['text'].apply(remove_stopwords)

## POS tagging and lemmatization

In [102]:
from nltk.corpus import wordnet
import nltk
from nltk.stem import WordNetLemmatizer

def pos_tag_and_lemmatize(text):
    """Lemmatize ``text`` and compute part-of-speech ratios.

    Tokens are POS-tagged with NLTK; each tag is mapped to a WordNet category
    by its first letter (``J`` -> adjective, ``V`` -> verb, ``R`` -> adverb).
    Every other tag is lemmatized and counted as a noun, so ``noun_ratio``
    also includes any token that is not tagged as adjective, verb or adverb.

    Parameters
    ----------
    text : str
        Text to process.

    Returns
    -------
    tuple
        ``(lemmas, noun_ratio, adj_ratio, verb_ratio, adv_ratio, token_count,
        unique_words)`` where ``lemmas`` is the space-joined lemmatized text,
        the ratios are fractions of ``token_count``, and ``unique_words`` is
        the number of distinct tokens before lemmatization.
    """
    tokens = nltk.word_tokenize(text)
    token_count = len(tokens)
    unique_words = len(set(tokens))

    if token_count == 0:
        # Return an empty *string* (not an empty list) so the first element
        # has the same type as in the normal path and a later comparison of
        # the text column with '' recognises token-less posts.
        return "", 0.0, 0.0, 0.0, 0.0, token_count, unique_words

    pos_tags = nltk.pos_tag(tokens)
    lemmatizer = WordNetLemmatizer()

    # First letter of the tagger's tag -> WordNet POS constant; anything else is a noun.
    prefix_to_wordnet = {'J': wordnet.ADJ, 'V': wordnet.VERB, 'R': wordnet.ADV}
    counts = {wordnet.NOUN: 0, wordnet.ADJ: 0, wordnet.VERB: 0, wordnet.ADV: 0}

    lemmas = []
    for token, pos in pos_tags:
        wn_pos = prefix_to_wordnet.get(pos[:1], wordnet.NOUN)
        counts[wn_pos] += 1
        lemmas.append(lemmatizer.lemmatize(token, pos=wn_pos))

    noun_ratio = counts[wordnet.NOUN]/token_count
    adj_ratio = counts[wordnet.ADJ]/token_count
    verb_ratio = counts[wordnet.VERB]/token_count
    adv_ratio = counts[wordnet.ADV]/token_count

    lemmas = " ".join(lemmas)

    return lemmas, noun_ratio, adj_ratio, verb_ratio, adv_ratio, token_count, unique_words

# Apply POS-tagging, lemmatization, and counting to the DataFrame.
# Each call returns a 7-element tuple; .apply(pd.Series) expands it into seven
# columns, which are assigned in the order listed on the left-hand side
# ("text" is overwritten with the lemmatized text).
df[['text', 'noun_ratio', 'adj_ratio', 'verb_ratio', 'adv_ratio', 'words', 'unique_words']] = df['text'].apply(pos_tag_and_lemmatize).apply(pd.Series)

## Lexical diversity

In [103]:
# Tokens per distinct token: 1.0 when no token repeats, larger with more repetition.
# NOTE(review): this is the inverse of the usual type-token ratio (unique / total) - confirm the direction is intended.
df['lexical_div'] = df['words'] / df['unique_words']

## Average word length

In [104]:
# NOTE(review): "chars" is the string length of the cleaned text measured before
# stop-word removal (spaces included), while "words" is the token count after
# stop-word removal, so this over-states the true mean word length - confirm
# whether that is intended before relying on the feature.
df['avg_word_length'] = df['chars'] / df['words']

In [105]:
# Preview the engineered features.
df.head()

Unnamed: 0,text,sentences,question_marks_ratio,exclamation_marks_ratio,links,emails_addresses,chars,capital_letters_ratio,noun_ratio,adj_ratio,verb_ratio,adv_ratio,words,unique_words,lexical_div,avg_word_length
12727,article robert castro writes would anyone dodl...,14,0.071429,0.571429,0,3,1335,0.058878,0.562044,0.167883,0.218978,0.051095,137,117,1.17094,9.744526
12958,article jerry hartzler cat write article micha...,5,0.0,0.0,0,5,402,0.072508,0.538462,0.153846,0.205128,0.102564,39,36,1.083333,10.307692
17116,try unsubscribe group send email doesnt work c...,4,0.25,0.0,0,1,397,0.081818,0.595238,0.238095,0.142857,0.02381,42,40,1.05,9.452381
6186,article john eaton write thats one problem far...,13,0.230769,0.0,0,2,1334,0.018051,0.583333,0.166667,0.191667,0.058333,120,109,1.100917,11.116667
6782,key question whether nonclipper encryption mak...,16,0.125,0.0,0,1,1906,0.018657,0.47929,0.224852,0.242604,0.053254,169,134,1.261194,11.278107


In [108]:
# Display the frame (pandas abbreviates long frames to their first and last rows).
df

Unnamed: 0,text,sentences,question_marks_ratio,exclamation_marks_ratio,links,emails_addresses,chars,capital_letters_ratio,noun_ratio,adj_ratio,verb_ratio,adv_ratio,words,unique_words,lexical_div,avg_word_length
12727,article robert castro writes would anyone dodl...,14,0.071429,0.571429,0,3,1335,0.058878,0.562044,0.167883,0.218978,0.051095,137,117,1.170940,9.744526
12958,article jerry hartzler cat write article micha...,5,0.000000,0.000000,0,5,402,0.072508,0.538462,0.153846,0.205128,0.102564,39,36,1.083333,10.307692
17116,try unsubscribe group send email doesnt work c...,4,0.250000,0.000000,0,1,397,0.081818,0.595238,0.238095,0.142857,0.023810,42,40,1.050000,9.452381
6186,article john eaton write thats one problem far...,13,0.230769,0.000000,0,2,1334,0.018051,0.583333,0.166667,0.191667,0.058333,120,109,1.100917,11.116667
6782,key question whether nonclipper encryption mak...,16,0.125000,0.000000,0,1,1906,0.018657,0.479290,0.224852,0.242604,0.053254,169,134,1.261194,11.278107
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11051,price seem good would like know give away secr...,5,0.000000,0.000000,0,0,258,0.110577,0.458333,0.166667,0.333333,0.041667,24,23,1.043478,10.750000
17409,get compwindowsx justin kibell system programm...,4,0.250000,0.000000,0,1,195,0.136095,0.700000,0.150000,0.150000,0.000000,20,20,1.000000,9.750000
8032,watch detroitminnesota game last night think s...,5,0.400000,0.400000,0,2,251,0.053398,0.560000,0.200000,0.240000,0.000000,25,23,1.086957,10.040000
13540,article jon livesey write article douglas grah...,17,0.058824,0.000000,0,5,2611,0.045057,0.558333,0.179167,0.212500,0.050000,240,186,1.290323,10.879167


## Cleaning missing data

In [109]:
# Drop rows whose text is the empty string. The explicit .copy() makes the
# result an independent frame, so the column assignments in later cells do not
# raise SettingWithCopyWarning or depend on the pre-filter frame.
df = df[df["text"] != ''].copy()

In [111]:
# Coerce every entry to str so the text tools below always receive strings
# (a no-op for values that already are str).
df["text"] = df["text"].apply(str)

## Sentiment analysis

In [112]:
from nltk.sentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

# One polarity-score dict per post (keys used below: 'neg', 'pos', 'neu').
sentiment_scores = df['text'].map(sia.polarity_scores)

# Spread the three scores into their own feature columns.
df['negative'] = sentiment_scores.map(lambda scores: scores['neg'])
df['positive'] = sentiment_scores.map(lambda scores: scores['pos'])
df['neutral'] = sentiment_scores.map(lambda scores: scores['neu'])

In [113]:
# Preview the frame with the sentiment columns appended.
df.head()

Unnamed: 0,text,sentences,question_marks_ratio,exclamation_marks_ratio,links,emails_addresses,chars,capital_letters_ratio,noun_ratio,adj_ratio,verb_ratio,adv_ratio,words,unique_words,lexical_div,avg_word_length,negative,positive,neutral
12727,article robert castro writes would anyone dodl...,14,0.071429,0.571429,0,3,1335,0.058878,0.562044,0.167883,0.218978,0.051095,137,117,1.17094,9.744526,0.033,0.139,0.828
12958,article jerry hartzler cat write article micha...,5,0.0,0.0,0,5,402,0.072508,0.538462,0.153846,0.205128,0.102564,39,36,1.083333,10.307692,0.068,0.093,0.839
17116,try unsubscribe group send email doesnt work c...,4,0.25,0.0,0,1,397,0.081818,0.595238,0.238095,0.142857,0.02381,42,40,1.05,9.452381,0.027,0.107,0.867
6186,article john eaton write thats one problem far...,13,0.230769,0.0,0,2,1334,0.018051,0.583333,0.166667,0.191667,0.058333,120,109,1.100917,11.116667,0.129,0.218,0.653
6782,key question whether nonclipper encryption mak...,16,0.125,0.0,0,1,1906,0.018657,0.47929,0.224852,0.242604,0.053254,169,134,1.261194,11.278107,0.12,0.231,0.649


## Vectorizing

### CountVectorizer

In [None]:
import warnings
import re
from sklearn.feature_extraction.text import CountVectorizer

def custom_tokenizer(text):
    """Return the word tokens of ``text`` as a list of strings.

    A token is a maximal run of word characters (letters, digits, underscore);
    whitespace and punctuation act as separators and are discarded.
    """
    return re.findall(r'\b\w+\b', text)

# Silence UserWarnings only while the vectorizer is built and fitted.
# A global warnings.filterwarnings("ignore", ...) would hide every UserWarning
# raised by any later cell for the rest of the kernel session.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=UserWarning)
    count_vectorizer = CountVectorizer(tokenizer=custom_tokenizer, stop_words = "english")
    count_df = count_vectorizer.fit_transform(df["text"])

count_df

### TfidfVectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features using scikit-learn's default tokenizer and built-in English stop-word list.
# max_df=0.7 is passed as a float, i.e. a document-frequency proportion cut-off
# (see the TfidfVectorizer docs for the exact semantics).
tfidf_vectorizer = TfidfVectorizer(stop_words = "english", max_df=0.7)
tfidf_df = tfidf_vectorizer.fit_transform(df["text"])
# Show the fitted matrix's repr.
tfidf_df

In [None]:
# Convert both document-term matrices to dense DataFrames with one column per
# vocabulary term. .toarray() replaces the sparse-matrix `.A` shorthand, which
# is deprecated and no longer available in recent SciPy releases.
# NOTE(review): densifying allocates rows x vocabulary cells per frame - confirm
# this fits in memory for the full corpus. The cell is also not re-runnable,
# because count_df / tfidf_df are rebound from sparse matrices to DataFrames.
count_df = pd.DataFrame(count_df.toarray(), columns = count_vectorizer.get_feature_names_out())
tfidf_df = pd.DataFrame(tfidf_df.toarray(), columns = tfidf_vectorizer.get_feature_names_out())
# Terms present in the count vocabulary but absent from the TF-IDF vocabulary.
difference = set(count_df.columns) - set(tfidf_df.columns)
print(difference)