# News Group Classification

## Preprocessing , Cleaning and Feature Extraction

### imports

In [2]:
import re
import nltk
from nltk.stem import SnowballStemmer,LancasterStemmer,PorterStemmer
from nltk.corpus import stopwords
from textblob import Word

### cleaning

In [3]:
# main
def cleaning(dataframe):
    lowercase(dataframe)
    replace_email(dataframe)
    remove_symbols(dataframe)
    replace_numbers(dataframe)
    remove_gibberish(dataframe)
    return dataframe

In [4]:

def lowercase(dataframe):
    dataframe['Article'] = dataframe['Article'].apply(lambda x: " ".join(x.lower() for x in x.split()))
    return dataframe

In [5]:
def replace_email(dataframe):
    regex = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    # dataframe['Article'] = dataframe['Article'].apply(lambda x: (re.sub(regex, " ", x)))
    dataframe['Article'] = dataframe['Article'].apply(lambda x: (re.sub(regex, "emailadd", x)))
    return dataframe

In [6]:

def remove_symbols(dataframe):
    # regex = r'[\p{P}\p{S}]'
    # regex = r'\W+'
    regex = r'[^a-zA-Z0-9]+'
    dataframe['Article'] = dataframe['Article'].apply(lambda x: re.sub(regex," ",x))
    return dataframe

In [7]:

def replace_numbers(dataframe):
    regex = r'\d+[\.\-\d]*'
    dataframe['Article'] = dataframe['Article'].apply(lambda x: re.sub(regex,"numb",x))
    return dataframe

In [8]:

def remove_gibberish(dataframe):
    nltk.download('words')
    words = set(nltk.corpus.words.words())
    dataframe['Article'] = dataframe['Article'].apply(lambda x: " ".join(w for w in nltk.wordpunct_tokenize(x) if w in words or not w.isalpha()))
    return dataframe

### preprocessing

In [9]:
# chain of functions for preprocessing
def preprocessing(dataframe):
    # cleaning
    dataframe = cleaning(dataframe)
    # preprocesing
    dataframe =  remove_stopWords(dataframe)
    # lemmatize(dataframe)

    # Stemmer(dataframe, PorterStemmer())
    # Stemmer(dataframe, LancasterStemmer())
    dataframe =  Stemmer(dataframe, SnowballStemmer("english"))
    return dataframe

In [10]:

def remove_stopWords(dataframe):
    nltk.download('stopwords')
    stop = stopwords.words('english')
    dataframe['Article'] = dataframe['Article'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
    return dataframe

In [11]:

def Stemmer(dataframe,st):
    dataframe['Article'] = dataframe['Article'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
    return dataframe

In [12]:

def lemmatize(dataframe):
    dataframe['Article'] = dataframe['Article'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
    return dataframe

### Feature extraction

In [13]:
def extract_features(train_x,valid_x): 
    tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w+')
    tfidf_vect.fit(df['Article'])
    xtrain = tfidf_vect.transform(train_x)
    xtest = tfidf_vect.transform(valid_x)
    return xtrain,xtest

## Model Training

### imports

In [14]:
from pathlib import Path
from preprocessing import preprocess
from sklearn import preprocessing
# from model1 import linear
# from model2 import naive, naive_bayes
from sklearn.metrics import accuracy_score

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics

import pandas as pd
import os

### Loading Data

In [15]:
Paths = {
    "data": r"../Data/20news-18828",  # contains subdirectories!
    "data_train": r"../Data/train",  # contains subdirectories!
    "data_test": r"../Data/test",  # contains subdirectories!
}

In [None]:

def load_data(path):
    dataframe = pd.DataFrame(columns=["Label", "Article"])
    dirs = os.listdir(path)
    for dir in dirs:
        for file in os.listdir(os.path.join(path, dir)):
            with open(os.path.join(path, dir, file)) as f:
                row = pd.DataFrame([{"Label": dir, "Article": f.read()}])
                dataframe = pd.concat([dataframe, row])
    return dataframe

In [None]:

def read_data(path):
    dataframe = load_data(path)
    dataframe = dataframe.sample(frac=1, random_state=45).reset_index()
    return dataframe

### Setting up the model

In [16]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid,
                valid_y):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    return metrics.accuracy_score(predictions, valid_y)

In [17]:
def encode_y(ytrain, ytest):
    encoder = preprocessing.LabelEncoder()
    ytrain = encoder.fit_transform(ytrain)
    ytest = encoder.fit_transform(ytest)
    return ytrain,ytest

## Main

In [18]:
df = read_data(Paths["data"])
# df = pd.read_csv("../Data/data.csv")
# df = df.sample(frac=1, random_state=1).reset_index()

df.to_csv("../Data/data.csv", index=False)

preprocess(df)

# data split
train_x, valid_x, ytrain, ytest = model_selection.train_test_split(
    df['Article'], df['Label'])

encode_y(ytrain, ytest)

xtrain,xtest = extract_features(train_x,valid_x)

accuracy = train_model(naive_bayes.MultinomialNB(alpha=1e-10), xtrain,
                        ytrain, xtest, ytest)
print("Accuracy: ", accuracy)
accuracy = train_model(linear_model.LogisticRegression(max_iter=1000),
                        xtrain, ytrain, xtest, ytest)
print("Accuracy: ", accuracy)

Accuracy:  0.7837263649883153
Accuracy:  0.8253664754620778
