## Tweets topic classification

In [None]:
# Switch the working directory to the project folder. The folder itself is created
# by the `!mkdir tweets_topics` cell further down, so that cell has to run first
# on a fresh runtime.
%cd tweets_topics

/content/tweets_topics


In [None]:
# Importing NLP tools
import re, nltk, glob, random, pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.classify.scikitlearn import SklearnClassifier

# Importing Machine Learning Models
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier

from helper import get_features
from config import save_features, save_model, save_model_dir, output_dir, inference_dir


In [None]:
 
import os
import warnings
warnings.filterwarnings("ignore")

nltk.download('stopwords')
nltk.download('punkt')

# Seed the shuffle below so the train/test split (and the reported accuracies)
# are reproducible from run to run.
random.seed(42)

# Sorted so the processing order does not depend on the file system's listing order.
filenames = sorted(glob.glob("train_data/*.txt"))
if not filenames:
    # An empty glob otherwise only surfaces later, during training, as
    # "ValueError: not enough values to unpack (expected 2, got 0)".
    raise FileNotFoundError(
        "No training files match 'train_data/*.txt' (cwd: {})".format(os.getcwd()))

print("\nList of categories:\n")
for filename in filenames:
    print(filename)

labelled_tweets = []
all_words = []
stpwords = stopwords.words('english')
stopword_set = set(stpwords)  # built once instead of once per tweet
for filename in filenames:
    # The category is the file name without directory and extension
    # ("train_data/business.txt" -> "business"). Slicing with fixed offsets
    # would leave part of the directory name in the label.
    category = os.path.splitext(os.path.basename(filename))[0]

    with open(filename, encoding='utf-8') as category_file:
        raw_tweets = category_file.read().split('\n')  # one tweet per line

    for tweet in raw_tweets:
        # Inside a set ([]), ^ negates the class:
        #   \w matches alphanumeric characters and the underscore,
        #   \s matches whitespace characters,
        # so every character that is neither \w nor \s is removed.
        tweet = re.sub(r'[^\w\s]', '', tweet)

        # Replace a space followed by a run of digits with a single space.
        tweet = re.sub(r" \d+", " ", tweet)

        # Lower-case BEFORE removing stop words so capitalised stop words
        # ("The", "And", ...) are dropped too. Each tweet is reduced to its set
        # of distinct tokens; sorting keeps the token order deterministic.
        tweet = sorted(set(word_tokenize(tweet.lower())) - stopword_set)

        if not tweet:
            # Blank lines (e.g. the trailing newline of a file) or tweets made
            # only of stop words carry no signal; skip them.
            continue

        all_words += tweet  # accumulate the vocabulary seen in the training data
        labelled_tweets.append((tweet, category))

# shuffling list of tweet keywords
random.shuffle(labelled_tweets)

# Distinct words in first-seen order; the same membership information as the
# full token list, without the duplicates.
word_features = list(dict.fromkeys(all_words))

print("Generating features from data!")
# One (feature dict, label) pair per tweet; get_features comes from helper.py.
feature_set = [(get_features(text), label) for (text, label) in labelled_tweets]

# The first n shuffled examples are held out for testing, the rest is used for training.
n = 500
if len(feature_set) <= n:
    raise ValueError(
        "Need more than {} labelled tweets to keep a non-empty training set, got {}".format(
            n, len(feature_set)))
train_feature_set = feature_set[n:]
test_feature_set = feature_set[:n]

In [None]:

# Dictionary of classifiers: name -> object exposing .train(labelled_feature_sets).
# 'Naive_Bayes' is the NLTK class itself (its train() is called on the class);
# the others are scikit-learn estimators wrapped for the NLTK interface.
classifier_dict = {'Linear_SVC':SklearnClassifier(LinearSVC()),
                   'Naive_Bayes':nltk.NaiveBayesClassifier,
                   'Logistic_Regression':SklearnClassifier(LogisticRegression(multi_class='ovr')),  # one-vs-rest
                   'Multinomial_Naive_Bayes':SklearnClassifier(MultinomialNB()),
                   'SGD_Classifier':SklearnClassifier(SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None))
                   }

# save features
if save_features:
    print("Saving Features!")
    # Context manager guarantees the file is flushed and closed even if dump() fails.
    with open(save_model_dir+"word_features.pickle", "wb") as save_word_features:
        pickle.dump(word_features, save_word_features)

# training and saving models
for classifier_name, classifier_object in classifier_dict.items():
    # Announce before training so the message matches what is actually happening.
    print("Training {}...".format(classifier_name))
    classifier = classifier_object.train(train_feature_set)

    accuracy_percent = nltk.classify.accuracy(classifier, test_feature_set) * 100
    # One decimal place avoids float artefacts such as 72.39999999999999.
    print("{} is {:.1f}% accurate.\n".format(classifier_name, accuracy_percent))

    if save_model:
        print("Saving Trained {} model.".format(classifier_name))
        with open("{}.pickle".format(save_model_dir+classifier_name), "wb") as classifier_file:
            pickle.dump(classifier, classifier_file)

Saving Features!
Training Linear_SVC...
Linear_SVC is 74.2% accurate.

Saving Trained Linear_SVC model.
Training Naive_Bayes...
Naive_Bayes is 72.39999999999999% accurate.

Saving Trained Naive_Bayes model.
Training Logistic_Regression...
Logistic_Regression is 75.6% accurate.

Saving Trained Logistic_Regression model.
Training Multinomial_Naive_Bayes...
Multinomial_Naive_Bayes is 75.0% accurate.

Saving Trained Multinomial_Naive_Bayes model.
Training SGD_Classifier...
SGD_Classifier is 75.6% accurate.

Saving Trained SGD_Classifier model.


In [None]:
# !python train.py

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!

List of categories:

Traceback (most recent call last):
  File "train.py", line 88, in <module>
    classifier = classifier_object.train(train_feature_set)
  File "/usr/local/lib/python3.6/dist-packages/nltk/classify/scikitlearn.py", line 116, in train
    X, y = list(zip(*labeled_featuresets))
ValueError: not enough values to unpack (expected 2, got 0)


In [None]:
# prediction and producing output.txt
import pickle
import nltk
import pandas as pd
from helper import get_features

# SECURITY: pickle.load can execute arbitrary code embedded in the file; only load
# pickles produced by this project.
# NOTE(review): the training cell saves models as "<save_model_dir><name>.pickle"
# (e.g. Multinomial_Naive_Bayes.pickle) — confirm that 'model/MNB.pickle' exists.
with open('model/MNB.pickle', 'rb') as model_file:
    classifier = pickle.load(model_file)
with open('model/word_features.pickle', 'rb') as features_file:
    word_features = pickle.load(features_file)

def predict_topic(tweet_text):
    """Classify a single tweet and return the label chosen by ``classifier``.

    The text is lower-cased and tokenized with NLTK, turned into a feature set
    by ``get_features`` and handed to the module-level ``classifier``.
    No punctuation, digit or stop-word stripping is applied here.
    """
    lowered = tweet_text.lower()
    tokens = nltk.word_tokenize(lowered)
    features = get_features(tokens)
    return classifier.classify(features)


# output_dir is defined in config.py; import it here so this cell does not
# depend on an earlier cell having been run.
from config import output_dir

tweet_data = pd.read_csv('60tweets.csv')
# tweet_data.head()

# Write one "<id> <predicted topic>" line per tweet. The context manager closes
# the file even if a prediction raises part-way through.
with open(output_dir+"output.txt", "w+") as output:
    for index, row in tweet_data.iterrows():
        output.write("{} {}\n".format(row.id, predict_topic(row.tweets)))

business
business
politics
politics
technology
sports


In [None]:
%cd /content
!mkdir tweets_topics
%cd tweets_topics

/content
/content/tweets_topics


## Data download

In [None]:
# Download stopwords.txt into the current working directory.
!wget https://www.dropbox.com/s/xg4lred37b558el/stopwords.txt

--2020-12-11 01:17:29--  https://www.dropbox.com/s/xg4lred37b558el/stopwords.txt
Resolving www.dropbox.com (www.dropbox.com)... 162.125.5.18, 2620:100:601d:18::a27d:512
Connecting to www.dropbox.com (www.dropbox.com)|162.125.5.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/xg4lred37b558el/stopwords.txt [following]
--2020-12-11 01:17:29--  https://www.dropbox.com/s/raw/xg4lred37b558el/stopwords.txt
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc1163e22b45ebc1e97710c51a0c.dl.dropboxusercontent.com/cd/0/inline/BE3W99apAmqHgO71_7J3NyYUSjkvj1w2GMC6FJkNKTt5vT0cBgknUsa1oVQF6yySMo8rknvc5py5XGeq-iDDaR3t-ejvoLIkzKve6NOR-27otd78KBRNHonUcvJ6wjQChbw/file# [following]
--2020-12-11 01:17:30--  https://uc1163e22b45ebc1e97710c51a0c.dl.dropboxusercontent.com/cd/0/inline/BE3W99apAmqHgO71_7J3NyYUSjkvj1w2GMC6FJkNKTt5vT0cBgknUsa1oVQF6yySMo8rknvc5py5XGeq-iDDaR3t-ejvoLIkzKve6NOR-27o

In [None]:
# Fetch the sample tweets CSV, the zipped datasets and the utilities.py helper
# module into the current working directory.
!wget https://www.dropbox.com/s/7qc0hfq61ovabdp/60tweets.csv
!wget https://www.dropbox.com/s/dc76asx6a845sdt/datasets.zip
!wget https://www.dropbox.com/s/gcxs864fz93f1d4/utilities.py
# Extract the archive into ./datasets (the keyword-extraction cells read datasets/training_set.csv).
!unzip datasets.zip -d datasets

--2020-12-11 01:09:25--  https://www.dropbox.com/s/7qc0hfq61ovabdp/60tweets.csv
Resolving www.dropbox.com (www.dropbox.com)... 162.125.5.18, 2620:100:601d:18::a27d:512
Connecting to www.dropbox.com (www.dropbox.com)|162.125.5.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/7qc0hfq61ovabdp/60tweets.csv [following]
--2020-12-11 01:09:25--  https://www.dropbox.com/s/raw/7qc0hfq61ovabdp/60tweets.csv
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc764485e035e7fa578f3c45016d.dl.dropboxusercontent.com/cd/0/inline/BE2nYAERNp8Im7GBEdCA_VtHcJmUwvciyhGe96_rsvfHFg946-X-LoXW61KUEBiZHhqBjXrG96Hok-HqYO90LEq4VBSi2VJVtWjeEGQnPRqg0vPal7gGGJ1tyKStS80D1UY/file# [following]
--2020-12-11 01:09:26--  https://uc764485e035e7fa578f3c45016d.dl.dropboxusercontent.com/cd/0/inline/BE2nYAERNp8Im7GBEdCA_VtHcJmUwvciyhGe96_rsvfHFg946-X-LoXW61KUEBiZHhqBjXrG96Hok-HqYO90LEq4VBSi2VJVtWjeEGQnPRqg0vP

In [None]:
# Install YAKE directly from its GitHub repository. The install is unpinned;
# the captured log below shows it built yake 0.4.3 at the time.
!pip install git+https://github.com/LIAAD/yake

Collecting git+https://github.com/LIAAD/yake
  Cloning https://github.com/LIAAD/yake to /tmp/pip-req-build-5do9hpsa
  Running command git clone -q https://github.com/LIAAD/yake /tmp/pip-req-build-5do9hpsa
Collecting segtok
  Downloading https://files.pythonhosted.org/packages/41/08/582dab5f4b1d5ca23bc6927b4bb977c8ff7f3a87a3b98844ef833e2f5623/segtok-1.5.10.tar.gz
Collecting jellyfish
[?25l  Downloading https://files.pythonhosted.org/packages/6c/09/927ae35fc5a9f70abb6cc2c27ee88fc48549f7bc4786c1d4b177c22e997d/jellyfish-0.8.2-cp36-cp36m-manylinux2014_x86_64.whl (93kB)
[K     |████████████████████████████████| 102kB 4.6MB/s 
Building wheels for collected packages: yake, segtok
  Building wheel for yake (setup.py) ... [?25l[?25hdone
  Created wheel for yake: filename=yake-0.4.3-py2.py3-none-any.whl size=66280 sha256=ae1542617908f40dac565c3a7b76c3e8d2d8982a064163c8f8a76119d6f56ab3
  Stored in directory: /tmp/pip-ephem-wheel-cache-3zkvhbci/wheels/be/35/27/e4ebd54b78c1806ed8b0271ce247fcd91e

In [None]:
# Download the small English spaCy model and check it against the installed spaCy version.
! python -m spacy download en_core_web_sm
! python -m spacy validate
# NLTK resources: 'punkt' (tokenizer models) and the 'stopwords' corpus.
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[2K[38;5;2m✔ Loaded compatibility table[0m
[1m
[38;5;4mℹ spaCy installation: /usr/local/lib/python3.6/dist-packages/spacy[0m

TYPE      NAME             MODEL            VERSION                            
package   en-core-web-sm   en_core_web_sm   [38;5;2m2.2.5[0m   [38;5;2m✔[0m
link      en               en_core_web_sm   [38;5;2m2.2.5[0m   [38;5;2m✔[0m

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

https://gist.githubusercontent.com/sebleier/554280/raw/7e0e4a1ce04c2bb7bd41089c9821dbcf6d0c786c/NLTK's%2520list%2520of%2520english%2520stopwords

https://github.com/kavgan/nlp-in-practice/blob/master/tf-idf/Keyword%20Extraction%20with%20TF-IDF%20and%20SKlearn.ipynb

In [None]:
# CSV read by the keyword-extraction cells below (pd.read_csv(input_file)).
input_file = 'datasets/training_set.csv'
# Tag inserted into the output file names: "<model_name>_<input_type>_output.<ext>".
input_type = 'train'

## Keyword extraction using YAKE

In [None]:
import pandas as pd
# Load the CSV named by `input_file`; the cell's displayed value is its row count.
tweet_data = pd.read_csv(input_file)
len(tweet_data)

1120000

In [None]:
%%time
model_name = 'yake'
import yake
import pandas as pd
tweet_data = pd.read_csv(input_file)
# tweet_data.head()

# yake specific parameters
language = "en"
max_ngram_size = 2 # decrease the size if you want smaller keywords
deduplication_thresold = 0.9
deduplication_algo = 'seqm'
windowSize = 1
numOfKeywords = 1 # make sure to adjust no. of keywords

custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_thresold, dedupFunc=deduplication_algo, windowsSize=windowSize, top=numOfKeywords, features=None)



ids = []
keywords_list = []

for index, row in tqdm(tweet_data.iterrows(), total=len(tweet_data)):
    keywords = custom_kw_extractor.extract_keywords(row.tweets)
    # print(index, keywords)
    ids.append(row.id)
    if keywords == []:
        keyword = 'nan'
    else:
        keyword = keywords[-1][-1]
    keywords_list.append(keyword)

datatype = 'txt'
output_textfile = "{}_{}_output.{}".format(model_name, input_type, datatype)
output = open(output_textfile, "w+")
print("Creating "+output_textfile)
for index, text in zip(ids, keywords_list):
    output.write("{} {}\n".format(index, text))
output.close()


datatype = 'csv'
output_textfile = "{}_{}_output.{}".format(model_name, input_type, datatype)
print("Creating "+output_textfile)
tweet_keywords_df = pd.DataFrame(columns=['tweet_id','keyword'])
tweet_keywords_df['tweet_id'] = ids
tweet_keywords_df['keyword'] = keywords_list
tweet_keywords_df.to_csv(output_textfile)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Creating yake__output.txt
Creating yake__output.csv
CPU times: user 208 ms, sys: 5.03 ms, total: 213 ms
Wall time: 254 ms


## Keyword extraction using POS tags (nouns and proper nouns) and TF-IDF

In [None]:
from tqdm.notebook import tqdm

In [None]:
%%time
model_name = 'NLP_Base'  # prefix of the output file names written by the extraction cell below
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from utilities import pre_process, get_stop_words

import re
import spacy
from collections import Counter
from string import punctuation
# nlp = spacy.load("en_core_web_sm")
import en_core_web_sm
# Load the pipeline from the installed model package; get_hotwords() uses `nlp`
# for tokenization, POS tags and its default stop-word list.
nlp = en_core_web_sm.load()

#####################--Functions--#######################

def get_hotwords(text):
    """Return the noun and proper-noun tokens of ``text`` in document order.

    The text is lower-cased before being parsed by the module-level ``nlp``
    pipeline. Tokens that are in the pipeline's default stop-word list, or whose
    text occurs in ``string.punctuation``, are discarded; of the rest only
    tokens tagged ``PROPN`` or ``NOUN`` are kept. Duplicates are preserved.
    """
    wanted_pos = ('PROPN', 'NOUN')
    stop_words = nlp.Defaults.stop_words
    return [
        token.text
        for token in nlp(text.lower())
        if token.text not in stop_words
        and token.text not in punctuation
        and token.pos_ in wanted_pos
    ]


def sort_coo(coo_matrix):
    """Return ``(col, data)`` pairs of ``coo_matrix`` sorted by descending value.

    Pairs are ordered by ``data`` first and by ``col`` second, both descending.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first ``topn`` items to ``{feature name: score}``.

    ``sorted_items`` is a sequence of ``(index, score)`` pairs; each index is
    looked up in ``feature_names`` and each score is rounded to 3 decimals.
    Insertion order of the returned dict follows the order of ``sorted_items``.
    """
    return {
        feature_names[idx]: round(score, 3)
        for idx, score in sorted_items[:topn]
    }

def get_all_keywords(tweet, topn=10):
    """Return the ``topn`` highest-scoring TF-IDF terms of ``tweet``.

    Relies on the module-level ``cv`` (count vectorizer), ``tfidf_transformer``
    and ``feature_names``, which must have been fitted/defined beforehand.

    Parameters:
        tweet: text of the document to extract keywords from.
        topn: number of top-scoring terms to keep (default 10).

    Returns:
        dict mapping term -> TF-IDF score rounded to 3 decimals, in descending
        score order.
    """
    # generate tf-idf for the given document
    tf_idf_vector = tfidf_transformer.transform(cv.transform([tweet]))

    # sort the tf-idf vector by descending order of scores
    sorted_items = sort_coo(tf_idf_vector.tocoo())

    return extract_topn_from_vector(feature_names, sorted_items, topn)

def get_keyword(tweet):
    """Return a single keyword for ``tweet``.

    The keyword is the highest-scoring of the tweet's top-5 TF-IDF terms that is
    also a hotword (a noun / proper noun returned by ``get_hotwords``). When none
    of the top terms is a hotword, the first hotword is returned; when the tweet
    has no hotwords at all, the string ``'na'`` is returned.
    """
    hotwords = get_hotwords(tweet)

    # Reuse the shared TF-IDF pipeline instead of duplicating it; n here is 5.
    keywords = get_all_keywords(tweet, topn=5)

    # Keep only the TF-IDF terms that are also hotwords.
    final_keywords = {term: score for term, score in keywords.items() if term in hotwords}

    if not final_keywords:
        return hotwords[0] if hotwords else 'na'

    # On ties max() returns the first term in dict order, i.e. the one ranked
    # first by sort_coo.
    return max(final_keywords, key=final_keywords.get)

##########################--Main Program--###############################


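# NOTE: everything below is commented out, yet it contains the only visible
# definitions of `cv`, `tfidf_transformer` and `feature_names`, which
# get_all_keywords() and get_keyword() read as globals. Re-enable at least the
# fitting steps (through `feature_names=...`) before calling those functions on
# a fresh kernel.
# if __name__ == "__main__":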
    
    # tweet_data = pd.read_csv(input_file)
    # df = tweet_data
    # df['tweets'] = df['tweets'].apply(lambda x:pre_process(x))

    # # #load a set of stop words
    # stopwords=get_stop_words("stopwords.txt")

    # #get the text column 
    # docs=df['tweets'].tolist()

    # #create a vocabulary of words, 
    # #ignore words that appear in 85% of documents, 
    # #eliminate stop words
    # cv=CountVectorizer(max_df=0.85,stop_words=stopwords)
    # word_count_vector=cv.fit_transform(docs)

    # tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
    # tfidf_transformer.fit(word_count_vector)

    # # you only needs to do this once
    # feature_names=cv.get_feature_names()

    # ids = []
    # keywords = []

    # for index, row in tqdm(tweet_data.iterrows(), total = len(tweet_data)):
    #     keyword = get_keyword(row.tweets)
    #     ids.append(row.id)
    #     keywords.append(keyword)
    #     # print(row.id, keyword)

    # datatype = 'txt'
    # output_textfile = "{}_{}_output.{}".format(model_name, input_type, datatype)
    # print("Creating "+output_textfile)
    # output = open(output_textfile, "w+") # for text file
    # for index, text in zip(ids, keywords):
    #     output.write("{} {}\n".format(index, text))
    # output.close() # closing and saving text file

    # datatype = 'csv'
    # output_textfile = "{}_{}_output.{}".format(model_name, input_type, datatype)
    # print("Creating "+output_textfile)
    # tweet_keywords_df = pd.DataFrame(columns=['tweet_id','keyword'])
    # tweet_keywords_df['tweet_id'] = ids
    # tweet_keywords_df['keyword'] = keywords
    # tweet_keywords_df.to_csv(output_textfile)

CPU times: user 677 ms, sys: 38 ms, total: 715 ms
Wall time: 716 ms


In [None]:
%%time
ids = []
keywords = []

for index, row in tqdm(tweet_data.iterrows(), total = len(tweet_data)):
    keyword = get_keyword(row.tweets)
    ids.append(row.id)
    keywords.append(keyword)
    # print(row.id, keyword)

datatype = 'txt'
output_textfile = "{}_{}_output.{}".format(model_name, input_type, datatype)
print("Creating "+output_textfile)
output = open(output_textfile, "w+") # for text file
for index, text in zip(ids, keywords):
    output.write("{} {}\n".format(index, text))
output.close() # closing and saving text file

datatype = 'csv'
output_textfile = "{}_{}_output.{}".format(model_name, input_type, datatype)
print("Creating "+output_textfile)
tweet_keywords_df = pd.DataFrame(columns=['tweet_id','keyword'])
tweet_keywords_df['tweet_id'] = ids
tweet_keywords_df['keyword'] = keywords
tweet_keywords_df.to_csv(output_textfile)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Buffered data was truncated after reaching the output size limit.

In [None]:
# Play an audio beep. Any audio URL will do.
# Note: this import rebinds the name `output`, which earlier cells use for
# their output file handle.
from google.colab import output
output.eval_js('new Audio("https://upload.wikimedia.org/wikipedia/commons/0/05/Beep-09.ogg").play()')

In [None]:
# increase speed by using this
%%timeit
tqdm.pandas()
tweet_data['keyword'] = tweet_data['tweets'].progress_apply(lambda row: custom_kw_extractor.extract_keywords(row)[-1][-1] \
                                         if custom_kw_extractor.extract_keywords(row) != [] \
                                         else 'nan')