In [82]:
import pandas as pd
import matplotlib.pyplot as plt
import pickle

%matplotlib inline

FILE_TEST = 'formatted_csv_dataset/formatted_test.tweet.csv'
FILE_TRAIN = 'formatted_csv_dataset/formatted_training.csv'

# Load the test tweets and keep only the relevant columns. The explicit .copy() makes the
# subset an independent frame, so later column assignments on it cannot raise
# SettingWithCopyWarning.
test_tweets_df = pd.read_csv(FILE_TEST, sep=',')
test_tweets_df = test_tweets_df[['text', 'coordinates_lat', 'coordinates_long', 'tweet_city']].copy()

tweets_df = pd.read_csv(FILE_TRAIN, sep=',')

# Swap latitude and longitude column values; we made a mistake when formatting the dataset to a csv file.
# A single rename maps every label through the dict once, so both columns are exchanged in one step
# without a temporary name and without mutating the frame in place.
tweets_df = tweets_df.rename(columns={
    'coordinates_lat': 'coordinates_long',
    'coordinates_long': 'coordinates_lat',
})

# Keep only relevant columns (again as an independent copy).
tweets_df = tweets_df[['id', 'text', 'coordinates_lat', 'coordinates_long', 'tweet_city']].copy()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [83]:
# Text preprocessing
import nltk
import re
import numpy as np

def preprocess(tweet):
    """Strip URLs and special characters from a tweet and return its word tokens joined by spaces.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Falsy values (``None``, ``''``) yield ``''``. Any other
        non-string value (for example the float NaN pandas uses for a missing
        csv cell) is reported with an ``error: ...`` line and also yields ``''``.

    Returns
    -------
    str
        The ``\\w+`` tokens of the tweet joined by single spaces, or ``''`` when
        the tweet has fewer than 3 tokens or could not be processed.
    """
    if not tweet:
        return ''

    if not isinstance(tweet, str):
        # Explicit replacement for the former bare `except:`; the regex calls below can
        # only fail on non-string input, so this keeps the diagnostic without also
        # swallowing KeyboardInterrupt/SystemExit.
        print('error: {}'.format(tweet))
        return ''

    without_urls = re.sub(r'http\S+', '', tweet)    # Remove urls
    # Keep only word characters. For a group-free pattern this matches what
    # nltk.RegexpTokenizer(r'\w+').tokenize returns, without building a tokenizer per tweet.
    tokens = re.findall(r'\w+', without_urls)

    if len(tokens) < 3:    # Ignore tweets with less than 3 words
        return ''

    return ' '.join(tokens)


def get_preprocessed_df(df):
    """Return a new frame with cleaned tweet text and without rows whose text became empty.

    The input frame is left untouched: the cleaned ``text`` column is attached to a
    new frame via ``assign`` instead of being written into ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column holding raw tweets.

    Returns
    -------
    pandas.DataFrame
        Frame with the same columns, ``text`` replaced by ``preprocess(text)``,
        rows with blank text removed and the index renumbered from 0.
    """
    cleaned = df.assign(text=df['text'].apply(preprocess))
    has_text = cleaned['text'].map(str.strip).ne('')
    return cleaned.loc[has_text].reset_index(drop=True)

# Clean the training and test tweets with the same pipeline; the names are rebound
# to the frames returned by get_preprocessed_df.
print('Preprocessing text')
tweets_df = get_preprocessed_df(tweets_df)
test_tweets_df = get_preprocessed_df(test_tweets_df)

Preprocessing text


In [86]:
USE_EMBEDDINGS = False    # Configure word embeddings based features or tf-idf based
# Length of the tweet vector built below; presumably the vector size of the
# configured embedding model -- TODO confirm against the embeddings file.
NUM_FEATURES = 300


def get_tweet_vector(tweet_row, model, vocab):
    """Build a tweet vector as the tf-idf weighted average of its word vectors.

    Parameters
    ----------
    tweet_row : array-like of float
        tf-idf values of one tweet; position ``i`` belongs to ``vocab[i]``.
    model : object
        Embedding lookup supporting ``token in model`` and ``model.get_vector(token)``
        (e.g. gensim ``KeyedVectors``).
    vocab : sequence of str
        Feature names of the tf-idf vectorizer.

    Returns
    -------
    numpy.ndarray
        Vector of length ``NUM_FEATURES``: the sum of ``tfidf * word_vector`` over the
        tokens that occur in the tweet and are known to the model, divided by the
        number of such tokens. All zeros when no token qualifies.
    """
    tweet_vec = np.zeros(NUM_FEATURES)
    num_tokens = 0    # Counts contributing tokens only (used to start at 1, skewing the average)

    weights = np.asarray(tweet_row)[:len(vocab)]
    # Visit only the non-zero tf-idf entries instead of scanning the whole vocabulary.
    for i in np.flatnonzero(weights):
        token = vocab[i]

        # Membership test on the model itself works across gensim versions;
        # the `model.vocab` attribute was removed in gensim 4.
        if token not in model:
            continue

        tweet_vec += weights[i] * model.get_vector(token)
        num_tokens += 1

    if num_tokens == 0:    # Nothing to average; avoid dividing by zero
        return tweet_vec

    return tweet_vec / num_tokens
            
                    
        
def get_embedding_features(tf_idf_features, model, vocab):
    """Convert every tf-idf row into its tweet vector and stack the results into one array."""
    return np.array([
        get_tweet_vector(row, model, vocab)
        for row in tf_idf_features
    ])

In [87]:
from gensim.models import KeyedVectors

# Word embedding model; stays None unless embedding based features are enabled.
model = None

if USE_EMBEDDINGS:
    print('loading embedding model')
    # Relative path, resolved against the notebook's working directory.
    word_embeddings_file = '../../../text_rewriting/arnet_miner_analysis/wiki.en.vec'
    # NOTE(review): called with default arguments, so the file is presumably in the
    # text word2vec format -- confirm.
    model = KeyedVectors.load_word2vec_format(word_embeddings_file)

In [88]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import tree
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score


def report_result(test_labels, predicted_labels):
    """Print accuracy, macro-averaged F1 and micro-averaged F1 of the predictions, in that order."""
    metrics = (
        ('Accuracy: ', accuracy_score, {}),
        ('Macro f1 score: ', f1_score, {'average': 'macro'}),
        ('Micro f1 score: ', f1_score, {'average': 'micro'}),
    )
    # Each score is computed right before it is printed, one line per metric.
    for label, metric, extra in metrics:
        print(label, metric(y_true=test_labels, y_pred=predicted_labels, **extra))


TRAIN_SIZE = 100000

train_df = tweets_df.head(TRAIN_SIZE)
# Work on a copy so the prediction columns added below do not leak into test_tweets_df.
test_df = test_tweets_df.copy()

vectorizer = TfidfVectorizer(ngram_range=(1, 1), stop_words='english')    # Stop words filtered

print('Extracting tf-idf features')
# The vocabulary is fitted on the training tweets only and reused for the test tweets.
# Both matrices are densified; get_embedding_features indexes the rows as dense arrays.
train_features = vectorizer.fit_transform(train_df['text']).toarray()
test_features = vectorizer.transform(test_df['text']).toarray()

if USE_EMBEDDINGS:
    print('Computing word vector based features')
    # get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
    # and fall back to the old name on older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        vocab = list(vectorizer.get_feature_names_out())
    else:
        vocab = vectorizer.get_feature_names()
    assert len(vocab) == len(train_features[0])
    assert len(vocab) == len(test_features[0])
    train_tf_idf_shape = train_features.shape
    test_tf_idf_shape = test_features.shape

    train_features = get_embedding_features(train_features, model, vocab)
    test_features = get_embedding_features(test_features, model, vocab)

    # The embedding step must keep one row per tweet.
    assert train_features.shape[0] == train_tf_idf_shape[0]
    assert test_features.shape[0] == test_tf_idf_shape[0]


# Predicting the tweet city

train_labels = train_df['tweet_city']

# classifier = LogisticRegression()

classifier = GaussianNB()

# classifier = SVC()

print('Training and predicting city using classifier: {}'.format(classifier))
classifier.fit(train_features, train_labels)
test_df['predicted_city'] = classifier.predict(test_features)

# We assign arbitrary values to coordinates since we're only interested in city prediction accuracy
test_df['predicted_long'] = 0.0
test_df['predicted_lat'] = 0.0

# report_result(test_labels, predicted_labels)

Extracting tf-idf features
Training and predicting lat and long using classifier: GaussianNB()


In [163]:
# Use the frame's index values as the tweet id that pairs a prediction with its oracle row.
test_df['id'] = test_df.index

# Predictions and ground truth share the layout (id, city, latitude, longitude).
output_df = test_df.loc[:, ['id', 'predicted_city', 'predicted_lat', 'predicted_long']]
oracle_df = test_df.loc[:, ['id', 'tweet_city', 'coordinates_lat', 'coordinates_long']]

# One record per tweet, without the frame index.
output_data = [record for record in output_df.to_records(index=False)]
oracle_data = [record for record in oracle_df.to_records(index=False)]

In [164]:
# Official evaluation for the shared task - WNUT

# !{sys.executable} -m pip install ujson

# import ujson as json
import math

EARTH_RADIUS = 6372.8


def _calc_dist_radian(pLat, pLon, lat, lon):
    """
    Calculate the Great Circle Distance between two points on earth
    http://en.wikipedia.org/wiki/Great-circle_distance
    """
    cos_pLat = math.cos(pLat)
    sin_pLat = math.sin(pLat)
    cos_lat = math.cos(lat)
    sin_lat = math.sin(lat)
    long_diff = pLon - lon
    cos_long_diff = math.cos(long_diff)
    sin_long_diff = math.sin(long_diff)
    numerator = math.sqrt(math.pow(cos_lat * sin_long_diff, 2) +
                          math.pow(cos_pLat * sin_lat - sin_pLat * cos_lat * cos_long_diff, 2))
    denominator = sin_pLat * sin_lat + cos_pLat * cos_lat * cos_long_diff
    radian = math.atan2(numerator, denominator)
    return radian * EARTH_RADIUS


def _degree_radian(degree):
    return (degree * math.pi) / 180


def calc_dist_degree(pLat, pLon, lat, lon):
    """Great circle distance between two points whose coordinates are given in degrees."""
    return _calc_dist_radian(
        _degree_radian(pLat),
        _degree_radian(pLon),
        _degree_radian(lat),
        _degree_radian(lon),
    )


def evaluate_submission(output_data, oracle_data):
    """Print the WNUT evaluation line (accuracy, median error, mean error) for a submission.

    Parameters
    ----------
    output_data, oracle_data : sequences of records
        Predicted and ground-truth records in the same order and of equal length.
        Each record is read positionally: ``[0]`` id, ``[1]`` city label,
        ``[2]`` latitude and ``[3]`` longitude in degrees.

    Returns
    -------
    None
        The result is printed, not returned. An empty submission reports 0.0 for
        all three metrics.

    Raises
    ------
    AssertionError
        If the two sequences differ in length or paired records have different ids.
    """
    assert len(output_data) == len(oracle_data)
    right, wrong = 0, 0
    error_distance_list = []
    for output, oracle in zip(output_data, oracle_data):
        assert output[0] == oracle[0]
        if output[1] == oracle[1]:
            right += 1
        else:
            wrong += 1
        error_distance = calc_dist_degree(output[2], output[3], oracle[2], oracle[3])
        error_distance_list.append(error_distance)

    # The small epsilon keeps the divisions defined when there are no records.
    accuracy = round(right / (right + wrong + 1e-6), 4)
    error_distance_list.sort()
    total_num = len(error_distance_list)

    mean_error = round(sum(error_distance_list) / (total_num + 1e-6), 4)
    # Element at position n // 2 of the sorted errors (the upper middle one for even n).
    # Guarded because indexing an empty list would raise IndexError.
    median_error = round(error_distance_list[total_num // 2], 4) if total_num else 0.0

    # Placeholders for fields of the result line that are not used in this notebook.
    output_file = ''
    submission_type = ''

    result = "WNUT evaluation: {}, {}, {}, {}, {}".format(output_file,
                                                  submission_type,
                                                  accuracy,
                                                  median_error,
                                                  mean_error)
    print(result)
    

    
# Score the predicted records against the oracle records; the result line is printed.
print('Evaluating result')
evaluate_submission(output_data, oracle_data)


Evaluating result
WNUT evaluation: , , 0.0297, 7776.0522, 7405.35


In [58]:
# Marker printed once the notebook has run to the end.
print('Finished')

Finished
