In [1]:
#! /usr/bin/env python

import tensorflow as tf
import pandas as pd
import numpy as np
import os
import data_helpers
from tensorflow.contrib import learn
import csv
from sklearn import metrics
import yaml


# point this toward your JAVA bin
os.environ['JAVAHOME'] = 'C:\\Program Files\\Java\\jre1.8.0_144\\bin'

# Set TensorFlow Parameters

In [2]:
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    if x.ndim == 1:
        x = x.reshape((1, -1))
    max_x = np.max(x, axis=1).reshape((-1, 1))
    exp_x = np.exp(x - max_x)
    return exp_x / np.sum(exp_x, axis=1).reshape((-1, 1))

with open("config.yml", 'r') as ymlfile:
    cfg = yaml.load(ymlfile)


In [3]:
# Parameters
# ==================================================

# Data Parameters

# Eval Parameters
tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
tf.flags.DEFINE_string("checkpoint_dir", r"runs\1523831946\checkpoints", "Checkpoint directory from training run")
tf.flags.DEFINE_boolean("eval_train", False, "Evaluate on all training data")

# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")


FLAGS = tf.flags.FLAGS
#FLAGS._parse_flags()
#print("\nParameters:")
#for attr, value in sorted(FLAGS.__flags.items()):
#    print("{}={}".format(attr.upper(), value))
#print("")

# Fetch Tweets for user

In [21]:
import tweepy
import json
from datetime import datetime
import re

### Twitter API Auth Info.

#Enter your Twitter API Authentication info below 
access_token = ""
access_token_secret = ""
consumer_key = ""
consumer_secret = ""
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
    
def get_user_tweets(query, count,include_rts=True):
    
    api = tweepy.API(auth_handler=auth, wait_on_rate_limit = True , wait_on_rate_limit_notify = True)
    
    try:
        results = api.user_timeline(screen_name = query, count = count, include_rts = include_rts)
        return(results)
    except tweepy.TweepError as e:
        print(e)
        return(None)

In [22]:
user_tweets = get_user_tweets('unsaifi',count=100)
raw_tweets = []
for i in user_tweets:
    json = i._json
    raw_tweets.append(json['text'])

In [23]:
# CHANGE THIS: Load data. Load your own data here
dataset_name = cfg["datasets"]["default"]
datasets = {"target_names" : [i for i in os.listdir('data/input/SentenceCorpus/')]}
x_raw = raw_tweets
y_test = None

# Use Saved TensorFlow Model to Predict Labels for Tweets

In [24]:
# Map data into vocabulary
vocab_path = r"runs\1523831946\checkpoints\..\vocab"
#vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw)))

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(r"runs\1523831946\checkpoints")
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
      allow_soft_placement=True,
      log_device_placement=False)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        # Load the saved meta graph and restore variables
        saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
        saver.restore(sess, checkpoint_file)

        # Get the placeholders from the graph by name
        input_x = graph.get_operation_by_name("input_x").outputs[0]
        # input_y = graph.get_operation_by_name("input_y").outputs[0]
        dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]

        # Tensors we want to evaluate
        scores = graph.get_operation_by_name("output/scores").outputs[0]

        # Tensors we want to evaluate
        predictions = graph.get_operation_by_name("output/predictions").outputs[0]

        # Generate batches for one epoch
        batches = data_helpers.batch_iter(list(x_test), 64, 1, shuffle=False)

        # Collect the predictions here
        all_predictions = []
        all_probabilities = None

        for x_test_batch in batches:
            batch_predictions_scores = sess.run([predictions, scores], {input_x: x_test_batch, dropout_keep_prob: 1.0})
            all_predictions = np.concatenate([all_predictions, batch_predictions_scores[0]])
            probabilities = softmax(batch_predictions_scores[1])
            if all_probabilities is not None:
                all_probabilities = np.concatenate([all_probabilities, probabilities])
            else:
                all_probabilities = probabilities

# Print accuracy if y_test is defined
if y_test is not None:
    correct_predictions = float(sum(all_predictions == y_test))
    print("Total number of test examples: {}".format(len(y_test)))
    print("Accuracy: {:g}".format(correct_predictions/float(len(y_test))))
    print(metrics.classification_report(y_test, all_predictions, target_names=datasets['target_names']))
    print(metrics.confusion_matrix(y_test, all_predictions))

# Save the evaluation to a csv
predictions_human_readable = np.column_stack((np.array(x_raw),
                                              [int(prediction) for prediction in all_predictions],
                                              [ "{}".format(probability) for probability in all_probabilities]))
#out_path = os.path.join("prediction.csv")
#print("Saving evaluation to {0}".format(out_path))#
#with open(out_path, 'w') as f:
#    csv.writer(f).writerows(predictions_human_readable)

print('\n*********************** Evaluation Complete ***********************')


Evaluating...

INFO:tensorflow:Restoring parameters from C:\Users\Saifi\Py_Projects\Applied_AI\cnn-text-classification-tf-cahya\runs\1523831946\checkpoints\model-200

*********************** Evaluation Complete ***********************


# Let's have a look at the predicted categories

In [25]:
df = pd.DataFrame(predictions_human_readable,columns=['text','pred_label_number','probs']).drop(labels='probs',axis=1)
df['pred_label_text'] = df['pred_label_number'].map(lambda x: datasets['target_names'][int(x)])

In [26]:
df.head(10)

Unnamed: 0,text,pred_label_number,pred_label_text
0,"RT @SkySportsPL: WATCH: Rivals for 22 years, f...",3,entertainment
1,"RT @FaseehMangi: There are about 10,000 solar ...",8,tech
2,"RT @charliebilello: $10,000 invested in the Am...",1,business
3,"RT @FaseehMangi: As global prices drop, solar ...",1,business
4,RT @spectatorindex: BREAKING: South Korea will...,5,politics
5,"RT @Channel4News: ""He's put up with the most a...",7,sport
6,RT @Arsenal: Legend.\n\n#MerciArsène https://t...,7,sport
7,RT @AP: BREAKING: President Trump: U.S. strike...,5,politics
8,RT @business: China's billions are set to revi...,1,business
9,RT @MaxCRoser: Why numbers are sometimes bette...,3,entertainment


In [27]:
df['pred_label_text'].value_counts()

sport            35
politics         21
tech             17
business         13
entertainment     9
tennis            4
Name: pred_label_text, dtype: int64

# Named Entity Recognition (Stanford)

In [12]:
from nltk.tag import pos_tag
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

# Change the path according to your system
stanford_classifier = r'C:\Users\Saifi\AppData\Local\Continuum\Anaconda3\Scripts\stanford-ner\classifiers\english.all.3class.distsim.crf.ser.gz'
stanford_ner_path = r'C:\Users\Saifi\AppData\Local\Continuum\Anaconda3\Scripts\stanford-ner\stanford-ner.jar'
java_path = r"C:\Program Files\Java\jre1.8.0_144\bin" # replace this
os.environ['JAVAHOME'] = java_path

# Creating Tagger Object
st = StanfordNERTagger(stanford_classifier, stanford_ner_path, encoding='utf-8')


def custom_NER(text):
    
    text = text.replace("’",'')
    text = text.strip()
    tokenized_tweet = word_tokenize(text)
    classified_text = [st.tag(tokenized_tweet)]
    
    person  = []
    location = []
    org = []
    
    try:
        for ct in classified_text:
            for i in range(1,len(ct) - 1):
                
                if (ct[i][1] == 'PERSON' and ct[i+1][1] == 'PERSON') or (ct[i][1] == 'LOCATION' and ct[i+1][1] == 'LOCATION') or (ct[i][1] == 'ORGANIZATION' and ct[i+1][1] == 'ORGANIZATION'):
                    joined_word = ct[i][0] + ' ' + ct[i+1][0]
                    if ct[i][1] == 'PERSON':
                        person.append(joined_word)
                    elif ct[i][1] == 'LOCATION':
                        location.append(joined_word)
                    else:
                        org.append(joined_word)

                elif (ct[i][1] == 'PERSON' and ct[i+1][1] != 'PERSON' and ct[i-1][1] != 'PERSON') or (ct[i][1] == 'LOCATION' and ct[i+1][1] != 'LOCATION' and ct[i-1][1] != 'LOCATION') or (ct[i][1] == 'ORGANIZATION' and ct[i+1][1] != 'ORGANIZATION' and ct[i-1][1] != 'ORGANIZATION'):
                    joined_word = ct[i][0]
                    if ct[i][1] == 'PERSON':
                        person.append(joined_word)
                    elif ct[i][1] == 'LOCATION':
                        location.append(joined_word)
                    else:
                        org.append(joined_word)
    
        result = {'person' : person,
                 'location' : location,
                 'org' : org}
        
        return result

    except Exception as e:
        print(e)
        return(None)

The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.tag.corenlp.CoreNLPPOSTagger[0m or [91mnltk.tag.corenlp.CoreNLPNERTagger[0m instead.
  super(StanfordNERTagger, self).__init__(*args, **kwargs)


# Get Most Frequent People, Organizations & Locations mentioned in Tweets

In [13]:
ner_results_by_label = []

for category in df['pred_label_text'].unique():
    sub_df = df[df['pred_label_text'] == category]
    
    
    for tweet in sub_df['text'].values:
        ner_result = custom_NER(tweet)
        ner_result['category'] = category
        ner_results_by_label.append(ner_result)

In [14]:
df_NER = pd.DataFrame(ner_results_by_label)

In [15]:
NER_counts = []

for type_of_NER in ['location','org','person']:
    res = pd.DataFrame()
    res = df_NER.set_index(['category'])[type_of_NER].apply(pd.Series).stack()
    res = res.reset_index()
    res.drop(labels='level_1',axis=1,inplace=True)
    res.rename({0:type_of_NER},axis=1,inplace=True)
    for i in res.to_dict('records'):
        NER_counts.append(i)

    #print("\nMost Common ", type_of_NER)    
    #for cat in res['category'].unique():
    #    print('Top 5 in ', cat)
    #    print(res[res['category'] == 'sport'][type_of_NER].value_counts())

### Top 5 Locations Mentioned

In [16]:
if len(pd.DataFrame(NER_counts)['location'].value_counts()) < 5:
    print(pd.DataFrame(NER_counts)['location'].value_counts())
else:
    print(pd.DataFrame(NER_counts)['location'].value_counts()[:5])

Pakistan        12
China            3
Saudi Arabia     2
U.S.             2
North Korea      1
Name: location, dtype: int64


### Top 5 Organizations Mentioned

In [17]:
if len(pd.DataFrame(NER_counts)['org'].value_counts()) < 5:
    print(pd.DataFrame(NER_counts)['org'].value_counts())
else:
    print(pd.DataFrame(NER_counts)['org'].value_counts()[:5])

Google              2
Republican Party    1
Amazon IPO          1
Roy Moore           1
School Peshawar     1
Name: org, dtype: int64


### Top 5 Most Frequent People Mentioned

In [18]:
if len(pd.DataFrame(NER_counts)['person'].value_counts()) < 5:
    print(pd.DataFrame(NER_counts)['person'].value_counts())
else:
    print(pd.DataFrame(NER_counts)['person'].value_counts()[:5])

Malala         5
FaseehMangi    3
Ive            2
BarackObama    2
Yves Tanguy    1
Name: person, dtype: int64


### Top 5 by Predicted Label

In [19]:
df_NER_counts = pd.DataFrame(NER_counts)

In [20]:
for i in df_NER_counts['category'].unique():
    
    print('\nCategory : ', i,'\n')
    
    sub_df = df_NER_counts[df_NER_counts['category'] == i]
    
    for j in ['location','org','person']:
        if len(sub_df[j].value_counts()) < 5:
            print(sub_df[j].value_counts(),'\n')
        else:
            print(sub_df[j].value_counts()[:5],'\n')


Category :  entertainment 

Diwali      1
Pakistan    1
Name: location, dtype: int64 

Manchester United    1
LUMS                 1
Name: org, dtype: int64 

Asma Jahangir    1
Arsene Wenger    1
Malala           1
Name: person, dtype: int64 


Category :  tech 

Boston    1
Name: location, dtype: int64 

Google      2
McKinsey    1
AT &        1
& amp       1
IHC         1
Name: org, dtype: int64 

Letterman         1
Malala            1
S. Fischer        1
Fischer Verlag    1
Dave              1
Name: person, dtype: int64 


Category :  business 

Pakistan          3
China             2
Michigan State    1
Asia              1
Cambodia          1
Name: location, dtype: int64 

USA Gymnastics         1
MacDonalds Szechuan    1
Harvard                1
Amazon IPO             1
Bloomberg              1
Name: org, dtype: int64 

FaseehMangi     2
Bitcoin         1
Larry Nassar    1
Kim Jong-Un     1
Name: person, dtype: int64 


Category :  politics 

Pakistan        3
Saudi Arabia    2