In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import pickle

%matplotlib inline

FILE_TEST = 'formatted_csv_dataset/formatted_test.tweet.csv'
FILE_TRAIN = 'formatted_csv_dataset/formatted_training.csv'

# Test set: keep only the relevant columns. The explicit .copy() yields an independent frame,
# so columns written to it later do not trigger pandas' SettingWithCopyWarning.
test_tweets_df = pd.read_csv(FILE_TEST, sep=',')
test_tweets_df = test_tweets_df[['text', 'coordinates_lat', 'coordinates_long', 'tweet_city']].copy()

tweets_df = pd.read_csv(FILE_TRAIN, sep=',')

# Swap latitude and longitude column names; we made a mistake when formatting the dataset to a csv file.
# rename() applies the whole mapping in one pass, so no temporary column name is needed.
# NOTE(review): only the training file is swapped here - confirm the test file does not need the same fix.
tweets_df = tweets_df.rename(columns={'coordinates_lat': 'coordinates_long',
                                      'coordinates_long': 'coordinates_lat'})

# Keep only relevant columns (again as an independent copy, see above)
tweets_df = tweets_df[['id', 'text', 'coordinates_lat', 'coordinates_long', 'tweet_city']].copy()

In [4]:
# Text preprocessing
import nltk
import re
import numpy as np


# Compiled once at definition time instead of once per tweet.
_URL_RE = re.compile(r'http\S+')
_WORD_RE = re.compile(r'\w+')


def preprocess(tweet, min_tokens=3):
    """Normalise one tweet into a space-separated string of word tokens.

    URLs (anything starting with ``http``) are removed, then every run of word
    characters (``\\w+``) is kept as a token, which drops punctuation and other
    special characters.

    Args:
        tweet: Raw tweet text. Falsy values (``None``, ``''``) and non-string
            values (e.g. the NaN pandas uses for a missing csv cell) yield ``''``.
        min_tokens: Tweets with fewer tokens than this are discarded. Defaults
            to 3, the threshold this notebook has always used.

    Returns:
        The tokens joined by single spaces, or ``''`` when the tweet is
        missing, not a string, or too short.
    """
    if not tweet:
        return ''

    if not isinstance(tweet, str):
        # Previously a bare ``except:`` hid this case (and every other error,
        # including KeyboardInterrupt). Handle the one expected failure explicitly.
        print('error: {}'.format(tweet))
        return ''

    tweet = _URL_RE.sub('', tweet)      # Remove urls
    tokens = _WORD_RE.findall(tweet)    # Keep word tokens only, i.e. remove special characters

    if len(tokens) < min_tokens:        # Ignore tweets with too few words
        return ''

    return ' '.join(tokens)
 

def get_preprocessed_df(df):
    """Return a copy of ``df`` with cleaned ``text`` and unusable rows removed.

    Every value of the ``text`` column is passed through ``preprocess``; rows
    whose cleaned text is empty are dropped and the index is renumbered from 0.

    The input frame is left untouched. The previous version assigned to
    ``df['text']`` directly, which modified the caller's frame and raised
    SettingWithCopyWarning when ``df`` was itself a column subset.

    Args:
        df: DataFrame with a ``text`` column.

    Returns:
        A new DataFrame with the same columns as ``df``.
    """
    cleaned_text = df['text'].apply(preprocess)
    has_text = cleaned_text.apply(lambda t: t.strip() != '')
    return df.assign(text=cleaned_text).loc[has_text].reset_index(drop=True)

print('Preprocessing text')
# Clean the training frame first, then the test frame, with the same routine.
tweets_df, test_tweets_df = [get_preprocessed_df(frame) for frame in (tweets_df, test_tweets_df)]

Preprocessing text


In [6]:
# Creating labels for classification using clustering (manually setting num_clusters)

from sklearn.cluster import KMeans
import numpy as np

NUM_CLUSTERS = 10
RANDOM_SEED = 42    # Fixed seed so cluster ids (and the labels derived from them) are reproducible

# DataFrame.as_matrix() no longer exists in pandas; select the columns and take the underlying array.
# Column order is (longitude, latitude), so cluster_centers[:, 0] is longitude and [:, 1] is latitude.
coordinates_arr = tweets_df[['coordinates_long', 'coordinates_lat']].values

print('Clustering with {} means'.format(NUM_CLUSTERS))
kmeans = KMeans(n_clusters=NUM_CLUSTERS, init='k-means++', random_state=RANDOM_SEED)
tweets_df['cluster_label'] = kmeans.fit_predict(coordinates_arr)
cluster_centers = kmeans.cluster_centers_

Clustering with 3 means


  import sys


In [8]:
USE_EMBEDDINGS = False    # Configure word embeddings based features or tf-idf based
NUM_FEATURES = 300


def get_tweet_vector(tweet_row, model, vocab):
    """Build one embedding vector for a tweet from its tf-idf row.

    Each token that has a non-zero tf-idf value and is known to the embedding
    model contributes ``tf-idf value * word vector``; the sum is divided by the
    number of contributing tokens.

    Args:
        tweet_row: 1-D sequence of tf-idf values, aligned with ``vocab``.
        model: Embedding model providing ``get_vector(token)`` and a vocabulary
            mapping exposed as ``key_to_index`` or ``vocab``.
        vocab: Sequence of tokens; ``vocab[i]`` names feature ``i``.

    Returns:
        numpy array of length ``NUM_FEATURES``; all zeros when no token of the
        tweet is known to the model.
    """
    # gensim >= 4 exposes the vocabulary as `key_to_index`; older releases use `vocab`.
    known_tokens = getattr(model, 'key_to_index', None)
    if known_tokens is None:
        known_tokens = model.vocab

    tweet_vec = np.zeros(NUM_FEATURES)
    num_tokens = 0    # Started at 1 before, which made the result sum / (n + 1) rather than an average

    # Visit only the features present in this tweet instead of scanning the whole vocabulary.
    for i in np.flatnonzero(tweet_row):
        token = vocab[i]
        if token not in known_tokens:
            continue

        tweet_vec += tweet_row[i] * model.get_vector(token)
        num_tokens += 1

    if num_tokens == 0:
        return tweet_vec    # Nothing to average; avoid dividing by zero

    return tweet_vec / num_tokens
            
                    
        
def get_embedding_features(tf_idf_features, model, vocab):
    """Turn every tf-idf row into a tweet embedding.

    Calls ``get_tweet_vector`` once per row of ``tf_idf_features`` and stacks
    the resulting vectors, in row order, into a single numpy array.
    """
    return np.array([get_tweet_vector(row, model, vocab) for row in tf_idf_features])

In [9]:
from gensim.models import KeyedVectors

# The embedding model is only loaded when embedding-based features are enabled;
# otherwise `model` stays None.
if USE_EMBEDDINGS:
    print('Loading embedding model')
    word_embeddings_file = '../../../text_rewriting/arnet_miner_analysis/wiki.en.vec'
    model = KeyedVectors.load_word2vec_format(word_embeddings_file)
else:
    model = None

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import tree
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score


def report_result(test_labels, predicted_labels):
    """Print accuracy, macro-F1 and micro-F1 of predictions against the true labels."""
    accuracy = accuracy_score(y_true=test_labels, y_pred=predicted_labels)
    print('Accuracy: ', accuracy)

    for heading, averaging in (('Macro f1 score: ', 'macro'), ('Micro f1 score: ', 'micro')):
        score = f1_score(y_true=test_labels, y_pred=predicted_labels, average=averaging)
        print(heading, score)


TRAIN_SIZE = 100000    # We experiment with 100k data points due to memory limitations

# Work on independent copies: prediction columns added to these frames later must not
# leak back into tweets_df / test_tweets_df (and must not raise SettingWithCopyWarning).
train_df = tweets_df.head(TRAIN_SIZE).copy()
test_df = test_tweets_df.copy()

vectorizer = TfidfVectorizer(ngram_range=(1, 1), stop_words='english')    # Stop words filtered

print('Extracting tf-idf features')
# The vocabulary is fitted on the training text only and then reused for the test text.
train_features = vectorizer.fit_transform(train_df['text']).toarray()
test_features = vectorizer.transform(test_df['text']).toarray()

if USE_EMBEDDINGS:
    print('Computing word vector based features')
    # get_feature_names() was removed in scikit-learn 1.2 in favour of get_feature_names_out();
    # fall back to the old name on releases that predate the new one.
    if hasattr(vectorizer, 'get_feature_names_out'):
        vocab = list(vectorizer.get_feature_names_out())
    else:
        vocab = vectorizer.get_feature_names()

    # Every tf-idf column must correspond to exactly one vocabulary token.
    assert len(vocab) == train_features.shape[1]
    assert len(vocab) == test_features.shape[1]
    train_tf_idf_shape = train_features.shape
    test_tf_idf_shape = test_features.shape

    train_features = get_embedding_features(train_features, model, vocab)
    test_features = get_embedding_features(test_features, model, vocab)

    # The embedding step must keep one feature row per tweet.
    assert train_features.shape[0] == train_tf_idf_shape[0]
    assert test_features.shape[0] == test_tf_idf_shape[0]
    

# Predicting latitude and longitude

# The classification target is the k-means cluster each training tweet falls into.
train_labels = train_df['cluster_label']

classifier = LogisticRegression()

# classifier = GaussianNB()

# classifier = SVC()

print('Training and predicting lat and long using classifier: {}'.format(classifier))
classifier.fit(train_features, train_labels)
test_df['predicted_cluster_label'] = classifier.predict(test_features)

# Look up the centre of every predicted cluster in one indexing step.
# Column 0 of a centre becomes the predicted longitude, column 1 the predicted latitude.
predicted_centers = cluster_centers[test_df['predicted_cluster_label'].astype(int).values]
test_df['predicted_long'] = predicted_centers[:, 0]
test_df['predicted_lat'] = predicted_centers[:, 1]

Extracting tf-idf features
Training and predicting lat and long using classifier: LogisticRegression()


In [11]:
# Save predicted labels for tweets so we don't have to cluster and train again

# SAVE_PREDICTED_DFS = False    # Change this during runtime

# if SAVE_PREDICTED_DFS:
#     try:
#         save_csv_file_name = 'test_{}_clusters_nb_embedding_{}.csv'.format(NUM_CLUSTERS, USE_EMBEDDINGS)
#         test_df.to_csv('dataset_with_predictions/' + save_csv_file_name, index=False)

#         save_csv_file_name = 'train_{}_clusters_nb_embedding_{}.csv'.format(NUM_CLUSTERS, USE_EMBEDDINGS)
#         train_df.to_csv('dataset_with_predictions/' + save_csv_file_name, index=False)

#         print('Saved predicted dataset files')
#     except Exception as e:
#         print('Encountered exception {} when saving df files'.format(e))

In [12]:
# From here on the labels are the tweets' cities rather than cluster ids.
train_labels, test_labels = train_df['tweet_city'], test_df['tweet_city']

In [13]:
def predict_most_common_city_by_cluster(train=None, test=None):
    """Predict a city for every test tweet from its predicted cluster.

    For each cluster in the training frame the most frequent ``tweet_city`` is
    determined; each test row then receives the city of its
    ``predicted_cluster_label`` in a new ``predicted_city`` column.

    A predicted cluster without any training tweets used to raise a KeyError
    that was caught and only printed, so ``predicted_city`` was never created
    and the next cell failed. Such rows now fall back to the most frequent
    city of the whole training frame.

    Args:
        train: DataFrame with ``cluster_label`` and ``tweet_city`` columns.
            Defaults to the notebook-level ``train_df``.
        test: DataFrame with a ``predicted_cluster_label`` column; modified in
            place. Defaults to the notebook-level ``test_df``.

    Returns:
        dict mapping cluster label to its most frequent city.
    """
    train = train_df if train is None else train
    test = test_df if test is None else test

    cluster_city = dict()    # Most frequently occurring city for each cluster
    for cluster_label, group in train.groupby('cluster_label'):
        modes = group['tweet_city'].mode()
        if not modes.empty:    # mode() is empty when every city in the group is missing
            cluster_city[cluster_label] = modes.iloc[0]

    overall_modes = train['tweet_city'].mode()
    fallback_city = overall_modes.iloc[0] if not overall_modes.empty else None

    predicted_city = test['predicted_cluster_label'].map(cluster_city)
    unmatched = predicted_city.isna()
    if unmatched.any() and fallback_city is not None:
        print('No training city for {} test tweet(s); using overall most common city: {}'.format(
            int(unmatched.sum()), fallback_city))
        predicted_city = predicted_city.fillna(fallback_city)

    test['predicted_city'] = predicted_city
    return cluster_city

In [14]:
# Fill test_df['predicted_city'] from each test tweet's predicted cluster.
predict_most_common_city_by_cluster()

# Compare the predicted cities with the true test cities.
# NOTE(review): the recorded run reports 0.0 for all three metrics - presumably the city labels
# are formatted differently in the train and test files; verify before trusting these numbers.
print('Results for city prediction by coordinates')
report_result(test_labels, test_df['predicted_city'])

Results for city prediction by coordinates
Accuracy:  0.0
Macro f1 score:  0.0
Micro f1 score:  0.0


In [15]:
# The test frame's index doubles as the tweet id that pairs predictions with ground truth.
test_df['id'] = test_df.index

# Both frames use the same layout: (id, city, latitude, longitude).
output_df = test_df.loc[:, ['id', 'predicted_city', 'predicted_lat', 'predicted_long']]
oracle_df = test_df.loc[:, ['id', 'tweet_city', 'coordinates_lat', 'coordinates_long']]

# The evaluator indexes rows by position, so hand it plain lists of records.
output_data, oracle_data = [list(frame.to_records(index=False)) for frame in (output_df, oracle_df)]

In [16]:
# Official evaluation for the shared task - WNUT

# !{sys.executable} -m pip install ujson

# import ujson as json
import math

EARTH_RADIUS = 6372.8


def _calc_dist_radian(pLat, pLon, lat, lon):
    """
    Great-circle distance between two points whose coordinates are in radians.
    http://en.wikipedia.org/wiki/Great-circle_distance

    The central angle between the points is obtained with atan2 and then
    scaled by EARTH_RADIUS.
    """
    delta_lon = pLon - lon
    sin_p, cos_p = math.sin(pLat), math.cos(pLat)
    sin_o, cos_o = math.sin(lat), math.cos(lat)
    sin_delta, cos_delta = math.sin(delta_lon), math.cos(delta_lon)

    # The two squared terms under the root form the atan2 numerator.
    first_term = cos_o * sin_delta
    second_term = cos_p * sin_o - sin_p * cos_o * cos_delta
    numerator = math.sqrt(first_term ** 2 + second_term ** 2)
    denominator = sin_p * sin_o + cos_p * cos_o * cos_delta

    central_angle = math.atan2(numerator, denominator)
    return central_angle * EARTH_RADIUS


def _degree_radian(degree):
    """Convert an angle from degrees to radians."""
    return (degree * math.pi) / 180


def calc_dist_degree(pLat, pLon, lat, lon):
    """Great-circle distance between two points whose coordinates are in degrees."""
    in_radians = [_degree_radian(value) for value in (pLat, pLon, lat, lon)]
    return _calc_dist_radian(*in_radians)


def evaluate_submission(output_data, oracle_data):
    """Score predictions against the ground truth in the WNUT shared-task format.

    Both arguments are equally long sequences of records laid out as
    ``(id, city, latitude, longitude)``; records are paired by position and
    their ids must match.

    Prints one summary line and returns the metrics so they can be used
    programmatically. Empty input no longer raises IndexError when the median
    is looked up; all three metrics are then 0.0.

    Args:
        output_data: Predicted records.
        oracle_data: Ground-truth records.

    Returns:
        Tuple ``(accuracy, median_error, mean_error)``, each rounded to four
        decimals. Accuracy is the share of exactly matching cities; the errors
        are the distances returned by ``calc_dist_degree``.

    Raises:
        AssertionError: If the inputs differ in length or paired ids differ.
    """
    # In the original script both inputs were loaded from files:
    #     output_data = etl_data(output_file, submission_type)
    #     oracle_data = etl_data(oracle_file, submission_type)
    assert len(output_data) == len(oracle_data)

    right, wrong = 0, 0
    error_distance_list = []
    for output, oracle in zip(output_data, oracle_data):
        assert output[0] == oracle[0]
        if output[1] == oracle[1]:
            right += 1
        else:
            wrong += 1
        error_distance = calc_dist_degree(output[2], output[3], oracle[2], oracle[3])
        error_distance_list.append(error_distance)

    # The 1e-6 terms keep the divisions defined when there are no records.
    accuracy = round(right / (right + wrong + 1e-6), 4)
    error_distance_list.sort()
    total_num = len(error_distance_list)

    mean_error = round(sum(error_distance_list) / (total_num + 1e-6), 4)
    if total_num:
        # For an even count this takes the upper of the two middle values.
        median_error = round(error_distance_list[total_num // 2], 4)
    else:
        median_error = 0.0

    # Placeholders: there is no file or submission type when run inside the notebook.
    output_file = ''
    submission_type = ''

    result = "WNUT evaluation: {}, {}, {}, {}, {}".format(output_file,
                                                  submission_type,
                                                  accuracy,
                                                  median_error,
                                                  mean_error)
    print(result)
    return accuracy, median_error, mean_error
    

    
# Score the predicted records against the ground-truth records built earlier.
# NOTE(review): the recorded run shows accuracy 0.0 and a median error of about 8,758 -
# possibly latitude/longitude are swapped in one of the inputs (only the training csv is swapped
# at load time); verify.
print('Evaluating result')
evaluate_submission(output_data, oracle_data)


Evaluating result
WNUT evaluation: , , 0.0, 8758.0217, 8669.2916


In [58]:
# Marker printed once every preceding cell has run.
print('Finished')

Finished
