In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [5]:
# Read the train and test splits that already contain the estimated coordinates.
# Both CSV files sit in the same directory, so the names are declared first and
# the frames are then loaded in one pass (train first, test second).
train_file = 'train_10_clusters_lr_embedding_True.csv'
test_file = 'test_10_clusters_lr_embedding_True.csv'

train_df, test_df = (
    pd.read_csv('dataset_with_predictions/final/' + csv_name)
    for csv_name in (train_file, test_file)
)

# Display a single row as a quick look at the loaded columns.
test_df.head(1)

Unnamed: 0,text,coordinates_lat,coordinates_long,tweet_city,predicted_cluster_label,predicted_long,predicted_lat
0,I got 99 problems but this dick aint one,28.980133,-80.909141,palm bay-fl009-us,8,-2.408918,53.786732


In [6]:
# Official evaluation for shared task WNUT

# !{sys.executable} -m pip install ujson

# import ujson as json
import math

# Sphere radius that _calc_dist_radian multiplies the central angle by.
# NOTE(review): the value is presumably in kilometres, which would make every
# reported error distance a kilometre figure -- TODO confirm.
EARTH_RADIUS = 6372.8


def _calc_dist_radian(pLat, pLon, lat, lon):
    """
    Great-circle distance between the point (pLat, pLon) and the point (lat, lon).

    All four arguments are angles in radians. The central angle between the
    two points is computed with an atan2-based formulation and then scaled by
    EARTH_RADIUS.
    http://en.wikipedia.org/wiki/Great-circle_distance
    """
    delta_lon = pLon - lon
    cos_delta, sin_delta = math.cos(delta_lon), math.sin(delta_lon)
    sin_p, cos_p = math.sin(pLat), math.cos(pLat)
    sin_o, cos_o = math.sin(lat), math.cos(lat)

    # The atan2 "y" argument is the Euclidean norm of these two components.
    across = cos_o * sin_delta
    along = cos_p * sin_o - sin_p * cos_o * cos_delta
    y = math.sqrt(math.pow(across, 2) + math.pow(along, 2))

    # The atan2 "x" argument is the cosine of the central angle.
    x = sin_p * sin_o + cos_p * cos_o * cos_delta

    return math.atan2(y, x) * EARTH_RADIUS


def _degree_radian(degree):
    return (degree * math.pi) / 180


def calc_dist_degree(pLat, pLon, lat, lon):
    """
    Great-circle distance between two points whose coordinates are in degrees.

    Each coordinate is converted with _degree_radian, and the value computed by
    _calc_dist_radian on the converted angles is returned unchanged.
    """
    in_radians = [_degree_radian(angle) for angle in (pLat, pLon, lat, lon)]
    return _calc_dist_radian(*in_radians)


def evaluate_submission(output_data, oracle_data, output_file='', submission_type=''):
    """
    Score predictions against the oracle and print the WNUT summary line.

    Parameters
    ----------
    output_data, oracle_data : equally long sequences of records
        Each record is indexed positionally as (id, city, latitude, longitude),
        with the coordinates passed to calc_dist_degree (i.e. in degrees).
        Records are compared pairwise in order and must carry the same id.
    output_file, submission_type : str, optional
        Labels echoed in the printed summary. The defaults are empty strings,
        which yields a line of the form "WNUT evaluation: , , <acc>, <median>, <mean>".

    Returns
    -------
    tuple
        (accuracy, median_error, mean_error), each rounded to 4 decimals.
        All three are 0.0 when the inputs are empty.

    Raises
    ------
    AssertionError
        If the inputs differ in length or a pair of records has different ids.

    Notes
    -----
    The median is the element at index ``n // 2`` of the sorted error list
    (the upper-middle element when ``n`` is even). That convention is kept
    as-is because this cell is labelled as the official WNUT evaluation.
    """
    assert len(output_data) == len(oracle_data)

    right = 0
    error_distance_list = []
    for output, oracle in zip(output_data, oracle_data):
        assert output[0] == oracle[0]
        if output[1] == oracle[1]:
            right += 1
        error_distance_list.append(
            calc_dist_degree(output[2], output[3], oracle[2], oracle[3]))

    error_distance_list.sort()
    total_num = len(error_distance_list)

    # The 1e-6 term keeps both ratios defined (as 0.0) for empty input.
    accuracy = round(right / (total_num + 1e-6), 4)
    mean_error = round(sum(error_distance_list) / (total_num + 1e-6), 4)

    # Indexing an empty list would raise IndexError, so report 0.0 instead,
    # matching the values accuracy and mean_error take for empty input.
    if total_num:
        median_error = round(error_distance_list[total_num // 2], 4)
    else:
        median_error = 0.0

    result = "WNUT evaluation: {}, {}, {}, {}, {}".format(output_file,
                                                  submission_type,
                                                  accuracy,
                                                  median_error,
                                                  mean_error)
    print(result)
    return accuracy, median_error, mean_error

In [8]:
def show_result_report():
    """
    Print the WNUT evaluation of the predictions currently stored in test_df.

    Reads the global test_df, which must already contain a 'predicted_city'
    column. As a side effect, an 'id' column holding the frame's index is
    written into test_df.
    """
    test_df['id'] = test_df.index

    predicted_columns = ['id', 'predicted_city', 'predicted_lat', 'predicted_long']
    actual_columns = ['id', 'tweet_city', 'coordinates_lat', 'coordinates_long']

    # Both record lists come from the same frame, so they share row order and ids.
    output_data, oracle_data = (
        list(test_df[columns].to_records(index=False))
        for columns in (predicted_columns, actual_columns)
    )

    evaluate_submission(output_data, oracle_data)

In [9]:
def predict_most_common_city_by_cluster(train=None, test=None):
    """
    Predict, for every test row, the most frequent training city of its cluster.

    Parameters
    ----------
    train : DataFrame, optional
        Frame with 'cluster_label' and 'tweet_city' columns. Defaults to the
        global train_df.
    test : DataFrame, optional
        Frame with a 'predicted_cluster_label' column. Defaults to the global
        test_df. It is modified in place: a 'predicted_city' column is written.

    Rows whose predicted cluster label never occurs in the training data get
    the overall most common training city, and a message naming those labels
    is printed. This guarantees 'predicted_city' is always fully populated, so
    a later report cannot silently score a stale or missing column.
    """
    train = train_df if train is None else train
    test = test_df if test is None else test

    # Most frequently occurring city for each cluster (mode() sorts ties).
    cluster_city = {
        cluster_label: group['tweet_city'].mode()[0]
        for cluster_label, group in train.groupby('cluster_label')
    }

    # Mapping through a plain dict yields NaN for labels missing from it.
    predicted_city = test['predicted_cluster_label'].map(cluster_city)

    unseen = predicted_city.isna()
    if unseen.any():
        fallback_city = train['tweet_city'].mode()[0]
        unseen_labels = sorted(test.loc[unseen, 'predicted_cluster_label'].unique().tolist())
        print('Cluster labels {} were not seen in training; '
              'predicting overall most common city: {}'.format(unseen_labels, fallback_city))
        predicted_city = predicted_city.fillna(fallback_city)

    test['predicted_city'] = predicted_city
        

# Order matters: the prediction call writes test_df['predicted_city'], which
# show_result_report() reads when it builds the records to evaluate.
predict_most_common_city_by_cluster()

print('Result for predicting most common city by cluster:')
show_result_report()

Result for predicting most common city by cluster:
WNUT evaluation: , , 0.0297, 7776.0522, 7405.35


In [13]:
from sklearn.neighbors import KNeighborsClassifier


# Nearest-neighbour baseline: fit on the true training coordinates, then label
# each test row from its predicted coordinates. The feature/label selections do
# not depend on the neighbour count, so they are taken once before the sweep.
train_features = train_df[['coordinates_lat', 'coordinates_long']]
train_labels = train_df['tweet_city']
test_features = test_df[['predicted_lat', 'predicted_long']]

for NUM_NEIGHBOURS in (3, 5, 10, 50, 100):
    classifier = KNeighborsClassifier(n_neighbors=NUM_NEIGHBOURS)

    print('Training and predicting city using classifier: {}'.format(classifier))
    classifier.fit(train_features, train_labels)

    # show_result_report() reads the predictions back from test_df.
    test_df['predicted_city'] = classifier.predict(test_features)

    print('Result for predicting city by {} nearest neighbours:'.format(NUM_NEIGHBOURS))
    show_result_report()

Training and predicting city using classifier: KNeighborsClassifier(n_neighbors=3)
WNUT evaluation: , , 0.0, 8430.5643, 8051.9956


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC


# Same protocol for three further classifiers: fit on the true training
# coordinates, predict a city from each test row's predicted coordinates.
# The inputs are identical for every classifier, so they are selected once.
train_features = train_df[['coordinates_lat', 'coordinates_long']]
train_labels = train_df['tweet_city']
test_features = test_df[['predicted_lat', 'predicted_long']]

for classifier in (LogisticRegression(), GaussianNB(), SVC()):
    print('Training and predicting city using classifier: {}'.format(classifier))
    classifier.fit(train_features, train_labels)

    # show_result_report() reads the predictions back from test_df.
    test_df['predicted_city'] = classifier.predict(test_features)

    print('Result for predicting city by {}:'.format(classifier))
    show_result_report()

In [None]:
# Printed once execution reaches this cell, i.e. after the cells above have run.
print('Finished')