# Machine Learning: Text Classification Assignment

In [4]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split as tts
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader
import pandas as pd

### Use the CategorizedPlaintextCorpusReader to import the AP_News corpus.

In [None]:
# Root folder of the AP_News corpus, relative to the working directory.
path = 'AP_News/'
# File ids ending in ".txt" are treated as documents.
DOC_PATTERN = r'.*\.txt'
# The capture group (word/whitespace characters before a "/") supplies the
# category name -- presumably the folder each file sits in; confirm against
# the corpus layout on disk.
CAT_PATTERN = r'([\w_\s]+)/.*'
corpus = CategorizedPlaintextCorpusReader(path, DOC_PATTERN, cat_pattern=CAT_PATTERN)
# Display the file ids the reader picked up.
corpus.fileids()

### Create two separate lists - one containing the text from each document and another containing the category of each article in the corpus.

In [None]:
# Raw text of every file, in the order returned by corpus.fileids().
docs = [corpus.raw(fileid) for fileid in corpus.fileids()]
# Peek at the first document.
docs[0]

In [None]:
# Sanity check: the first category listed for the first file id.
corpus.categories(corpus.fileids()[0])[0]

In [None]:
# One label per file, in the same corpus.fileids() order used to build `docs`.
# Only the first category of each file is kept.
categories = [corpus.categories(fileid)[0] for fileid in corpus.fileids()]
categories

### Preprocess the corpus, ensuring to include the following steps.

- Word tokenize the documents.
- Lemmatize, stem, and lowercase all tokens.
- Remove punctuation and stop words.

In [None]:
def preprocess(docs):
    """Normalize raw documents into space-joined strings of stemmed tokens.

    Each document is word-tokenized. Tokens that are not purely alphabetic,
    or whose lowercase form is an English stop word, are dropped. Every
    remaining token is lowercased, lemmatized and then stemmed.

    Args:
        docs: Iterable of raw document strings.

    Returns:
        A list holding one preprocessed string per input document, in the
        same order as ``docs``.
    """
    lemmatizer = WordNetLemmatizer()
    stemmer = SnowballStemmer('english')
    # Build the stop-word set once. stopwords.words() returns a list, so
    # calling it per token repeats the corpus lookup and a linear scan for
    # every single token of every document.
    stop_words = set(stopwords.words('english'))
    preprocessed = []

    for doc in docs:
        cleaned = []
        for token in word_tokenize(doc):
            # Drops punctuation, numbers and mixed tokens such as "n't".
            if not token.isalpha():
                continue
            lowered = token.lower()
            if lowered in stop_words:
                continue
            cleaned.append(stemmer.stem(lemmatizer.lemmatize(lowered)))
        # Re-join so the result can be fed straight to a text vectorizer.
        preprocessed.append(" ".join(cleaned))

    return preprocessed

In [None]:
# Preprocess the whole corpus; one cleaned string per document.
preprocessed = preprocess(docs)
# Peek at the first cleaned document (closing bracket was missing here).
preprocessed[0]

### Split the data into training and testing sets with the size of the test set being 30% of the records.

In [None]:
# The label list and the document list must be the same length before
# they are split together.
print(len(categories))
len(preprocessed)

In [None]:
# Hold out 30% of the documents for testing.
# NOTE(review): no random_state is passed, so the split (and every score
# derived from it) changes from run to run.
X_train, X_test, y_train, y_test = tts(preprocessed, categories, test_size=0.3)

### Construct a pipeline that TF-IDF vectorizes the text and trains a Random Forest classification model.

In [None]:
# TfidfVectorizer is imported here because the import cell at the top only
# brings in CountVectorizer and TfidfTransformer.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
rfc = RandomForestClassifier(n_estimators=100)
# Two-step pipeline: TF-IDF features, then a 100-tree random forest.
pipe = Pipeline([
    ('vect', tfidf),
    ('clf', rfc)
])

# Fit both steps on the training split only.
pipe.fit(X_train, y_train)

### Generate predictions on the test set and print a classification report to evaluate how well the model performed.

In [None]:
# Predict labels for the held-out documents and print the per-class report.
y_pred_test = pipe.predict(X_test)
print(classification_report(y_test, y_pred_test))

### Perform 10-fold cross validation and obtain the average F1 score across all the folds.

In [None]:

# 10-fold cross-validation of the whole pipeline on the full corpus,
# scored with macro-averaged F1 (one score per fold).
scores = cross_val_score(pipe, preprocessed, categories, cv=10, scoring='f1_macro')
scores

In [None]:
# Average of the per-fold macro-F1 scores.
scores.mean()

### Ingest, preprocess, and predict the topic of the article at the following URL.

In [3]:
# Refit the pipeline on the complete corpus (not just the training split).
# NOTE(review): the recorded output for this cell is a NameError, presumably
# from running it before the cells that define `pipe`, `preprocessed` and
# `categories` -- confirm by re-running the notebook top to bottom.
pipe.fit(preprocessed, categories)

NameError: ignored

In [None]:
# Article whose topic the fitted pipeline is asked to predict.
url = 'https://www.nytimes.com/2019/11/25/business/uber-london.html'

In [None]:

import requests
from bs4 import BeautifulSoup

# Download the article. Fail loudly on an HTTP error instead of parsing an
# error page, and bound the wait so the cell cannot hang indefinitely.
result = requests.get(url, timeout=30)
result.raise_for_status()

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results can differ across machines.
soup = BeautifulSoup(result.text, 'html.parser')
article_body = soup.find('section', attrs={'name': 'articleBody'})
if article_body is None:
    # find() returns None when the section is absent (layout change, paywall
    # or consent page); report that directly rather than an AttributeError.
    raise ValueError('No <section name="articleBody"> found at {}'.format(url))
doc = article_body.text

# Use a separate name so the corpus-level `preprocessed` list stays intact;
# overwriting it breaks re-running the cross-validation and refit cells.
article_preprocessed = preprocess([doc])
pipe.predict(article_preprocessed)