<a href="https://colab.research.google.com/github/vidhishah9/News-Articles-Classification/blob/main/ece219Project1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd


In [None]:
# Read the labelled news-article dataset from disk, then show which columns it has.
df = pd.read_csv('Project1-ClassificationDataset.csv')
print(list(df.columns))


['full_text', 'summary', 'keywords', 'publish_date', 'authors', 'url', 'leaf_label', 'root_label']


In [None]:
# Report the dataset dimensions: number of rows first, then number of columns.
count_row, count_col = df.shape
print(count_row)
print(count_col)

3476
8


In [None]:
import random

import numpy as np

# Fix the seeds of Python's and NumPy's global random generators so that
# later cells drawing from them give the same results on every fresh run.
random.seed(42)
np.random.seed(42)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the articles for testing; only the raw text and the
# top-level label are kept. random_state is passed explicitly so the split is
# identical every time this cell runs, instead of depending on how many draws
# have already been taken from NumPy's global random generator.
train, test = train_test_split(
    df[["full_text", "root_label"]], test_size=0.2, random_state=42
)

In [None]:
import re
def clean(text):
  """Strip markup, HTML entities and other noise from a raw text string.

  Steps, in order: lines that begin with an http(s) URL are dropped, a few
  HTML entities are decoded, line breaks become spaces, the token " u " is
  expanded to " you ", backticks are removed, repeated "!" / "?" are collapsed
  to a single character, non-ASCII characters are discarded, remaining HTML
  tags are removed, and finally runs of spaces are collapsed to one space.

  Args:
    text: the raw string to clean.

  Returns:
    The cleaned string (possibly empty).
  """
  # Remove whole lines that start with a URL, including their line terminator.
  text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
  texter = re.sub(r"<br />", " ", text)
  texter = re.sub(r"&quot;", "\"", texter)
  # &#39; is the HTML entity for an apostrophe, not a double quote.
  texter = re.sub('&#39;', "'", texter)
  texter = re.sub('&amp;', 'and', texter)
  # Treat both kinds of line break the same way, before spaces are collapsed.
  texter = re.sub(r'[\r\n]', " ", texter)
  texter = re.sub(' u ', " you ", texter)
  texter = re.sub('`', "", texter)
  texter = re.sub(r"(!)\1+", r"!", texter)
  texter = re.sub(r"(\?)\1+", r"?", texter)
  # Drop every character that cannot be represented in ASCII.
  texter = texter.encode('ascii', 'ignore').decode('ascii')
  # Strip any remaining HTML/XML tags (non-greedy, so tags are removed one by one).
  texter = re.sub(r'<.*?>', '', texter)
  # Collapse runs of spaces last, so gaps left behind by the removals above
  # are normalised as well.
  texter = re.sub(' +', ' ', texter)
  return texter

In [None]:
# Clean only the article text. The label column is left untouched so the
# text-normalisation rules can never alter a class name. Series.map is used
# instead of DataFrame.map, which only exists in newer pandas releases.
train = train.assign(full_text=train["full_text"].map(clean))
test = test.assign(full_text=test["full_text"].map(clean))



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# CountVectorizer - converts text into a bag-of-words representation (word frequency counts).

# TfidfTransformer - converts word counts into TF-IDF scores (weights words by importance).
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
import re

# Lemmatizer: reduces a word to its base (dictionary) form.
wnl = nltk.wordnet.WordNetLemmatizer()

# Analyzer built from a CountVectorizer with default settings; it is called on
# a raw document to split it into word tokens (lower-casing and dropping
# punctuation along the way) before lemmatization.
analyzer = CountVectorizer().build_analyzer()

#The functions below are all text-processing helpers used before vectorization.

#POS tagging in nltk.pos_tag() follows the Penn Treebank format.
#WordNetLemmatizer needs a different format (it expects n, v, a, r).
#This function maps Penn POS tags → WordNet tags.

def penn2morphy(penntag):
    """Map a Penn Treebank POS tag to the matching WordNet POS letter.

    Only the first two characters of the tag are considered, so for example
    'NNS' and 'VBD' are looked up as 'NN' and 'VB'.

    Args:
        penntag: Penn Treebank tag string such as 'NN', 'VBD' or 'JJR'.

    Returns:
        One of 'n', 'a', 'v' or 'r'. Tags without a mapping, and values that
        cannot be used as a tag at all (e.g. None), fall back to 'n' (noun).
    """
    morphy_tag = {'NN': 'n', 'JJ': 'a',
                  'VB': 'v', 'RB': 'r'}
    try:
        return morphy_tag[penntag[:2]]
    except (KeyError, TypeError):
        # KeyError: tag has no WordNet equivalent. TypeError: input could not
        # be sliced or hashed. Anything else (e.g. KeyboardInterrupt) is
        # allowed to propagate instead of being silently swallowed.
        return 'n'


#The function below:
#Receives a list of words.
#Finds their part-of-speech (POS) tags.
#Lemmatizes them using the correct POS tags.

def lemmatize_sent(list_word):
    """Lemmatize a list of words using their part-of-speech tags.

    The words are POS-tagged together with pos_tag; each Penn Treebank tag is
    converted with penn2morphy and passed to the WordNet lemmatizer along
    with the lower-cased word.

    Args:
        list_word: list of word tokens.

    Returns:
        A list with one lemma per input word, in the same order.
    """
    lemmas = []
    for token, penn_tag in pos_tag(list_word):
        wordnet_pos = penn2morphy(penn_tag)
        lemmas.append(wnl.lemmatize(token.lower(), pos=wordnet_pos))
    return lemmas

#Uses analyzer(doc) to split the text into words.
#Lemmatizes each word.
#Removes numbers (if not word.isdigit()).

def rmv_nums(doc):
    """Tokenize and lemmatize a document, skipping purely numeric tokens.

    Args:
        doc: raw document text.

    Returns:
        A generator over the lemmas of `doc` for which str.isdigit() is False.
    """
    # Tokenization and lemmatization happen right away; only the digit
    # filtering is deferred until the generator is consumed.
    lemmas = lemmatize_sent(analyzer(doc))
    return (lemma for lemma in lemmas if not lemma.isdigit())



In [None]:
import nltk

# Fetch every NLTK resource this notebook needs, so it runs on a fresh kernel:
#   punkt_tab                       - tokenizer models (kept from the original cell)
#   wordnet                         - corpus used by WordNetLemmatizer
#   averaged_perceptron_tagger_eng  - model used by nltk.pos_tag
# NOTE(review): older NLTK releases name the tagger resource
# 'averaged_perceptron_tagger' - confirm against the installed NLTK version.
for resource in ('punkt_tab', 'wordnet', 'averaged_perceptron_tagger_eng'):
    nltk.download(resource)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# CountVectorizer
#
# NOTE: scikit-learn only applies `stop_words` when analyzer == 'word'. With a
# callable analyzer the option is ignored, so stop-word removal has to be done
# inside the analyzer itself.
english_stop_words = CountVectorizer(stop_words='english').get_stop_words()


def rmv_nums_and_stop_words(doc):
    """Tokenize and lemmatize `doc` via rmv_nums, then drop English stop words."""
    return [token for token in rmv_nums(doc) if token not in english_stop_words]


vectorizer = CountVectorizer(analyzer=rmv_nums_and_stop_words, min_df=3)

# How the count matrix is built:
#
# Steps 1 and 2: fit_transform() walks over every document in
# train['full_text'] and calls the analyzer on it. The analyzer tokenizes and
# lemmatizes the raw text, drops purely numeric tokens and English stop words,
# and returns the remaining tokens for that document.
#
# Step 3: building the vocabulary. Once all documents have been analyzed,
# CountVectorizer discards tokens that appear in fewer than 3 documents
# (min_df=3) and maps each remaining token to a unique column index.
#
# Step 4: transform() converts text into a sparse matrix using the learned
# vocabulary. Each document becomes a row, each vocabulary token a column, and
# each cell holds the count of that token in that document. The matrix is
# sparse because most tokens do not appear in most documents.

X_train_counts = vectorizer.fit_transform(train['full_text'])
# Show only the shape; printing the sparse matrix itself floods the output.
print("Shape of train count matrix:", X_train_counts.shape)
X_test_counts = vectorizer.transform(test['full_text'])

# TfidfTransformer
tfidf_transformer = TfidfTransformer()

# Fit the IDF weights on the training counts only, then transform them.
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Re-use the fitted IDF weights for the test counts (no refitting on test data).
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

# Print results
print("Shape of TF-IDF-processed train matrix:", X_train_tfidf.shape)
print("Shape of TF-IDF-processed test matrix:", X_test_tfidf.shape)




  (0, 13270)	2
  (0, 12103)	16
  (0, 4788)	2
  (0, 3076)	8
  (0, 365)	4
  (0, 3809)	3
  (0, 5952)	3
  (0, 1157)	13
  (0, 7753)	4
  (0, 12538)	5
  (0, 5155)	5
  (0, 4520)	5
  (0, 10827)	2
  (0, 12252)	12
  (0, 11933)	3
  (0, 5987)	7
  (0, 571)	2
  (0, 4153)	2
  (0, 4231)	1
  (0, 12985)	2
  (0, 12246)	1
  (0, 13322)	3
  (0, 5143)	2
  (0, 5609)	1
  (0, 4725)	2
  :	:
  (2779, 11862)	1
  (2779, 3825)	1
  (2779, 9941)	2
  (2779, 9313)	1
  (2779, 1108)	2
  (2779, 8108)	1
  (2779, 6828)	1
  (2779, 2336)	1
  (2779, 5141)	1
  (2779, 4077)	1
  (2779, 12320)	2
  (2779, 6693)	1
  (2779, 3054)	1
  (2779, 10435)	1
  (2779, 8129)	1
  (2779, 13474)	2
  (2779, 1787)	2
  (2779, 6756)	1
  (2779, 7215)	1
  (2779, 6871)	1
  (2779, 7426)	1
  (2779, 717)	1
  (2779, 4101)	1
  (2779, 1925)	1
  (2779, 10637)	1
Shape of TF-IDF-processed train matrix: (2780, 13594)
Shape of TF-IDF-processed test matrix: (696, 13594)


In [None]:
from sklearn.decomposition import NMF

# Factorise the TF-IDF training matrix with NMF, keeping 50 components;
# random_state pins the random initialisation so the result is repeatable.
model = NMF(
    n_components=50,
    init='random',
    random_state=0,
)
W_train = model.fit_transform(X_train_tfidf)

# Expected shape: (number of training documents, 50).
print(W_train.shape)

(2780, 50)


