# Machine Learning: Text Classification Assignment

In [0]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split as tts
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

### Use the CategorizedPlaintextCorpusReader to import the movie reviews corpus

In [0]:
from nltk.corpus import movie_reviews

In [21]:
import nltk
# Download the movie_reviews corpus package so the corpus readers can load it.
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


True

In [0]:
nltk.download('punkt')
DOC_PATTERN = r'.*\.txt'
# Capture only the leading directory name (e.g. "neg" in "neg/cv000_29416.txt")
# as the category. With the group wrapped around the whole file id, every file
# would end up in its own category.
CAT_PATTERN = r'([\w_\s]+)/.*'

# Root the reader at the downloaded movie_reviews corpus instead of a
# placeholder path; nltk.data.find resolves it inside the nltk_data directory.
corpus_root = nltk.data.find('corpora/movie_reviews')
corpus = CategorizedPlaintextCorpusReader(corpus_root, DOC_PATTERN, cat_pattern=CAT_PATTERN)

### Create two separate lists - one containing the text from each document and another containing the category of each article in the corpus.

In [0]:
# Raw text of every file in the corpus, in file-id order.
docs = list(map(movie_reviews.raw, movie_reviews.fileids()))

# First category label of each file, in the same order as `docs`.
categories = [
    movie_reviews.categories(review_id)[0]
    for review_id in movie_reviews.fileids()
]

In [0]:
# Show only the first few file ids; listing every id floods the cell output.
movie_reviews.fileids()[:10]

In [0]:
# Preview the start of the first document instead of dumping the whole corpus.
docs[0][:500]

### Preprocess the corpus, making sure to include the following steps.

- Word tokenize the documents.
- Lemmatize, stem, and lowercase all tokens.
- Remove punctuation and stop words.

In [39]:
# Resources used by preprocess: the tokenizer models for word_tokenize,
# the stop-word lists, and the WordNet data behind WordNetLemmatizer.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [0]:
def preprocess(docs):
    """Normalise raw documents into space-joined strings of cleaned tokens.

    Each document is word-tokenized; tokens that are not purely alphabetic
    or whose lowercase form is an English stop word are dropped; the
    remaining tokens are lowercased, lemmatized, and then stemmed.

    Args:
        docs: Iterable of raw document strings.

    Returns:
        A list with one cleaned, space-joined string per input document,
        in the same order as ``docs``.
    """
    lemmatizer = WordNetLemmatizer()
    stemmer = SnowballStemmer('english')
    # Build the stop-word set once. Calling stopwords.words() per token
    # rebuilds the list every time and makes each membership test linear.
    stop_words = set(stopwords.words('english'))
    preprocessed = []

    for doc in docs:
        cleaned = []
        for token in word_tokenize(doc):
            lowered = token.lower()
            if token.isalpha() and lowered not in stop_words:
                cleaned.append(stemmer.stem(lemmatizer.lemmatize(lowered)))
        preprocessed.append(" ".join(cleaned))
    return preprocessed

In [0]:
# Clean every review; the result stays aligned index-for-index with `categories`.
text_movie = preprocess(docs)

### Split the data into training and testing sets with the size of the test set being 30% of the records.

In [0]:
# Hold out 30% of the preprocessed reviews for testing; the fixed seed keeps
# the split repeatable.
X_train, X_test, y_train, y_test = tts(
    text_movie,
    categories,
    test_size=0.3,
    random_state=23,
)

### Construct a pipeline that TF-IDF vectorizes the text and trains a Random Forest classification model.

In [44]:
# Bag-of-words counts -> TF-IDF weighting -> random forest classifier.
# The forest is seeded so that repeated runs of the notebook give the same
# model and the same evaluation numbers.
model = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier(random_state=23)),
])
model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                

### Generate predictions on the test set and print a classification report to evaluate how well the model performed.

In [45]:
# Label the held-out reviews, then print per-class precision, recall and F1.
predictions = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

         neg       0.76      0.85      0.80       305
         pos       0.82      0.72      0.77       295

    accuracy                           0.79       600
   macro avg       0.79      0.78      0.78       600
weighted avg       0.79      0.79      0.78       600



### Perform 10-fold cross validation and obtain the average F1 score across all the folds.

In [47]:
# Macro-averaged F1 for each of 10 folds over the full preprocessed corpus.
scores = cross_val_score(
    estimator=model,
    X=text_movie,
    y=categories,
    scoring='f1_macro',
    cv=10,
)
(scores.mean(), scores.std())

(0.7976178854770497, 0.013770689922035404)

### Ingest, preprocess, and predict the topic of the article at the following URL.

In [0]:
# NOTE: the next cell reassigns `url`, so this value is not the one that gets fetched.
url = 'https://www.nytimes.com/2019/11/25/business/uber-london.html'

In [0]:
# URL that is passed to get_url_text below.
url = 'https://www.10news.com/lifestyle/exploring-san-diego/review-rise-of-the-resistance-fulfills-missing-star-wars-feel-at-galaxys-edge'

In [53]:
import requests
from bs4 import BeautifulSoup

def get_url_text(url, timeout=10):
    """Fetch a web page and return the text of its headings, paragraphs and list items.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A single string: the text of every matching tag, joined by spaces
        in document order.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
    """
    # Without a timeout the request can hang indefinitely on an unresponsive host.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx so an error page is never treated as the article.
    response.raise_for_status()

    TAGS = ['h1','h2','h3','h4','h5','h6','h7','p','li']
    soup = BeautifulSoup(response.text, 'lxml')
    return ' '.join(tag.get_text() for tag in soup.find_all(TAGS))

# Download the page text and run it through the same preprocessing as the training data.
text = get_url_text(url)
# preprocess expects a list of documents, so wrap the single page in a list.
cleaned = preprocess([text])
# The model was fit on the movie_reviews categories, so the label returned
# here is one of those categories ('neg' / 'pos').
model.predict(cleaned)[0]

'pos'