In [87]:
import argparse
import re
from typing import Tuple
import nltk
from nltk.corpus import twitter_samples
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline
import pandas as pd
import numpy as np
from torchtext.data.utils import get_tokenizer
from sklearn.metrics import classification_report
from typing import List
from nltk.tokenize import TweetTokenizer

In [103]:
from pprint import pprint

from sklearn.datasets import fetch_20newsgroups

# Fetch the training split of the 20 Newsgroups corpus and list the names
# of its target categories.
newsgroups_train = fetch_20newsgroups(subset='train')
pprint(list(newsgroups_train.target_names))

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']


In [2]:
# Download the NLTK `twitter_samples` corpus into a local cache directory and
# make that directory visible to NLTK's data loader.
path = '.cache'
nltk.download('twitter_samples', download_dir=path)
# Register the cache directory only once, so that re-running this cell does
# not keep appending duplicate entries to nltk.data.path.
if path not in nltk.data.path:
    nltk.data.path.append(path)

[nltk_data] Downloading package twitter_samples to .cache...
[nltk_data]   Package twitter_samples is already up-to-date!


In [3]:
# Read the raw tweet texts of each sentiment class from the corpus files.
positive_tweets, negative_tweets = (
    twitter_samples.strings(fileid)
    for fileid in ('positive_tweets.json', 'negative_tweets.json')
)

In [4]:
# Build one labelled frame per class: column 'X' holds the tweet text and
# column 'y' the sentiment label (1 = positive, 0 = negative).
pos = pd.DataFrame({'X': positive_tweets, 'y': 1})
neg = pd.DataFrame({'X': negative_tweets, 'y': 0})
# ignore_index=True gives every row a unique label; a plain concat would keep
# each frame's own 0..n-1 labels and leave duplicated index values, making
# label-based lookups ambiguous. The shuffle is positional and seeded, so the
# resulting row order is reproducible.
dataset = pd.concat([pos, neg], ignore_index=True).sample(frac=1, random_state=42)

In [5]:
# Positional split of the shuffled data: the first 8000 rows are used for
# training and every remaining row is held out for evaluation.
trainset, testset = dataset.iloc[:8000], dataset.iloc[8000:]

In [100]:
def tokenize(tweet: str) -> List[str]:
    """Split a tweet into tokens on runs of whitespace.

    Args:
        tweet: Raw tweet text.

    Returns:
        The whitespace-delimited pieces of ``tweet`` in their original
        order; an empty list when the text is empty or whitespace-only.
        Case, handles, punctuation and emoticons are left untouched.
    """
    tokens = tweet.split()
    return tokens

In [101]:
# Turn tweets into TF-IDF weighted unigram features using the notebook's
# `tokenize` function.
vectorizer = TfidfVectorizer(tokenizer=tokenize,
                             # The default regex token_pattern is ignored when
                             # a custom tokenizer is given; passing None makes
                             # that explicit and avoids the UserWarning.
                             token_pattern=None,
                             ngram_range=(1, 1),
                             min_df=5,
                             max_features=25000)

# Logistic regression with scikit-learn's default settings.
classifier = LogisticRegression()

# Chain feature extraction and classification so that raw text can be passed
# directly to fit() and predict().
pipeline = Pipeline([('vectorizer', vectorizer), ('classifier', classifier)])



In [102]:
# Train the pipeline on the training tweets and their labels.
pipeline.fit(trainset['X'], trainset['y'])

# Predict labels for the held-out tweets; the cell's displayed value is the
# fraction of predictions that match the true labels (accuracy).
predicted = pipeline.predict(testset['X'])
(predicted == testset['y']).mean()

0.989

In [97]:
# Per-class precision / recall / F1 for the held-out predictions, returned as
# a nested dict instead of a formatted string.
classification_report(
    y_true=testset['y'],
    y_pred=predicted,
    output_dict=True,
)

{'0': {'precision': 0.9980411361410382,
  'recall': 0.9990196078431373,
  'f1-score': 0.9985301322880941,
  'support': 1020},
 '1': {'precision': 0.9989785495403473,
  'recall': 0.9979591836734694,
  'f1-score': 0.998468606431853,
  'support': 980},
 'accuracy': 0.9985,
 'macro avg': {'precision': 0.9985098428406927,
  'recall': 0.9984893957583033,
  'f1-score': 0.9984993693599735,
  'support': 2000},
 'weighted avg': {'precision': 0.9985004687066996,
  'recall': 0.9985,
  'f1-score': 0.998499984618536,
  'support': 2000}}

In [107]:
# Sanity check: classify a single hand-written example with the fitted
# pipeline (labels: 1 = positive, 0 = negative).
pipeline.predict(['sad'])

array([0])