### Project - Heuristic and Learning based approaches for Twitter Geolocation
### Notebook for Geolocation Prediction in Twitter – SS 2020
#### Vishal Khanna-Mat. 120333-CS4DM
#### Sneha Mohanty-Mat. 120799-CS4DM
#### Sagar Nagaraj SimhaMat. 120797, CS4DM

### Alternate experimental setting with pipeline of Stemming, TFIDF vectorization and SCG Classifier with 100 clusters.
#### Results: Accuracy - 12.04%, Median error distance - 1271km, Mean error distance - 4004km

In [2]:
#Imports
import pandas as pd
import pickle

# Reading from the file and selecting the relevant columns
FILE_TEST = 'formatted_test.tweet.csv'
FILE_TRAIN = 'formatted_training.csv'

test_tweets_df = pd.read_csv(FILE_TEST, sep=',')
test_tweets_df = test_tweets_df[['text', 'coordinates_lat', 'coordinates_long', 'tweet_city']]    # Keep only relevant columns

tweets_df = pd.read_csv(FILE_TRAIN, sep=',') # The training dataset

# Swap latitude and longitude column values; we made a mistake when formatting the dataset to a csv file
tweets_df.rename(columns={'coordinates_lat' : 'coordinates_lat_old'}, inplace=True)
tweets_df.rename(columns={'coordinates_long' : 'coordinates_lat'}, inplace=True)
tweets_df.rename(columns={'coordinates_lat_old' : 'coordinates_long'}, inplace=True)

tweets_df = tweets_df[['id', 'text', 'coordinates_lat', 'coordinates_long', 'tweet_city', 'lang']]    # Keep only relevant columns

In [3]:
# Text preprocessing. Remove URLs and special characters and ignoring tweets lesser than 3 tokens
import nltk
import re
import numpy as np

def preprocess(tweet):
    if not tweet:
        return ''
    try:
        tweet = re.sub(r'http\S+', '', tweet)    # Remove urls
        tokenizer = nltk.RegexpTokenizer(r'\w+')    # Remove special characters
        tokens = tokenizer.tokenize(tweet)
    except:
        print('error: {}'.format(tweet))
        return ''

    if len(tokens) < 3:    # Ignore tweets with less than 3 words  
        return ''
        
    return ' '.join(tokens)


def get_preprocessed_df(df):
    df['text'] = df['text'].apply(lambda text: preprocess(text))   
    df = df.loc[df['text'].apply(lambda t: t.strip() != '')]
    df = df.reset_index(drop=True)
    return df

print('Preprocessing text')
tweets_df = get_preprocessed_df(tweets_df)
test_tweets_df = get_preprocessed_df(test_tweets_df)

Preprocessing text
error: nan
error: nan


In [4]:
# Creating labels for classification by clustering of cordinate points-lat,long (setting num_clusters based on heuristics of number of cities/districts etc)
from sklearn.cluster import MiniBatchKMeans
NUM_CLUSTERS = 100
mbk = MiniBatchKMeans(init ='k-means++', n_clusters = NUM_CLUSTERS, 
                      batch_size = 200, 
                      max_no_improvement = 10, verbose = 0) 

points = list(zip(tweets_df.coordinates_lat, tweets_df.coordinates_long))
coordinates_arr = np.array(points)
#coordinates_arr = tweets_df[['coordinates_lat', 'coordinates_long']].values

print('Clustering with {} means'.format(NUM_CLUSTERS))
mbk.fit(coordinates_arr)
tweets_df['cluster_label'] = mbk.labels_
cluster_centers = mbk.cluster_centers_

Clustering with 100 means


In [5]:
#Number of datapoints in each cluster label. This is to show the skewed distribution of cities.
tweets_df.groupby(['cluster_label']).size()

cluster_label
0      26485
1     319661
2     302348
3     269345
4      40248
       ...  
95     60601
96      8064
97     52009
98      5984
99     70102
Length: 100, dtype: int64

In [7]:
# Methods to blancing the dataset using Downsampler, Upsampler and SMOTE. Downsampler looses information.
# SMOTE and oversamper requires online learning mechanism to run over large datasets.
# from imblearn.under_sampling import RandomUnderSampler
# rus = RandomUnderSampler(random_state=42)
# X_res, y_res = rus.fit_resample(tweets_df, tweets_df['cluster_label'])
# X_res.groupby(['cluster_label']).size()

In [6]:
## Pipeline - Stemming, TFIDF vectorization and SGD Classifier

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

# Stemming Code

import nltk

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])
    
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

# text_mnb_stemmed = Pipeline([('vect', stemmed_count_vect), ('tfidf', TfidfTransformer(smooth_idf=True,use_idf=True)), 
#                              ('mnb', MultinomialNB(fit_prior=False))])

text_mnb_stemmed = Pipeline([('vect', stemmed_count_vect), ('tfidf', TfidfTransformer(smooth_idf=True,use_idf=True)), 
                             ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42))])

text_mnb_stemmed = text_mnb_stemmed.fit(tweets_df['text'], tweets_df['cluster_label'])

predicted_mnb_stemmed = text_mnb_stemmed.predict(test_tweets_df['text'])

#np.mean(predicted_mnb_stemmed == test_tweets_df['cluster_label'])

In [7]:
# Accuracy on training data
predicted_mnb_stemmed_train = text_mnb_stemmed.predict(tweets_df['text'])
print('The accuracy on training data is', np.mean(predicted_mnb_stemmed_train == tweets_df['cluster_label']))

The accuracy on training data is 0.5598479992426193


In [8]:
# Predicting the cluster label on test dataset
test_df = test_tweets_df
test_df['predicted_cluster_label'] = ''
test_df['predicted_cluster_label'] = text_mnb_stemmed.predict(test_df['text'])

# Cordinate prediction of the test text. Mapping the cluster label to the cluster center to assign the latitude and longitude to the text
test_df['predicted_long'] = test_df['predicted_cluster_label'].apply(lambda l: cluster_centers[int(l)][1])
test_df['predicted_lat'] = test_df['predicted_cluster_label'].apply(lambda l: cluster_centers[int(l)][0])

In [9]:
# City prediction. Mapping the most commonly occuring city label in the cluster to the text
def predict_most_common_city_by_cluster():
    cluster_city = dict()    # Most frequently occurring city for each cluster

    for cluster_label, df in tweets_df.groupby('cluster_label'):
        cluster_city[cluster_label] = df['tweet_city'].mode()[0]

    return cluster_city
cluster_city = predict_most_common_city_by_cluster()
test_df['predicted_city'] = test_df['predicted_cluster_label']
test_df['predicted_city'] = test_df['predicted_city'].map(cluster_city)

#print('Results for city prediction by coordinates')
#report_result(test_labels, test_df['predicted_city'])

In [10]:
# Arranging the prediction and oracle data in formats compatible with the evaluation script.
test_df['id'] = test_df.index

output_df = test_df[['id', 'predicted_city', 'predicted_lat', 'predicted_long']]
oracle_df = test_df[['id', 'tweet_city', 'coordinates_lat', 'coordinates_long']]

output_data = list(output_df.to_records(index=False))
oracle_data = list(oracle_df.to_records(index=False))

In [11]:
# Evaluation WNUT

# !{sys.executable} -m pip install ujson

# import ujson as json
import math

EARTH_RADIUS = 6372.8


def _calc_dist_radian(pLat, pLon, lat, lon):
    """
    Calculate the Great Circle Distance between two points on earth
    http://en.wikipedia.org/wiki/Great-circle_distance
    """
    cos_pLat = math.cos(pLat)
    sin_pLat = math.sin(pLat)
    cos_lat = math.cos(lat)
    sin_lat = math.sin(lat)
    long_diff = pLon - lon
    cos_long_diff = math.cos(long_diff)
    sin_long_diff = math.sin(long_diff)
    numerator = math.sqrt(math.pow(cos_lat * sin_long_diff, 2) +
                          math.pow(cos_pLat * sin_lat - sin_pLat * cos_lat * cos_long_diff, 2))
    denominator = sin_pLat * sin_lat + cos_pLat * cos_lat * cos_long_diff
    radian = math.atan2(numerator, denominator)
    return radian * EARTH_RADIUS


def _degree_radian(degree):
    return (degree * math.pi) / 180


def calc_dist_degree(pLat, pLon, lat, lon):
    pLat = _degree_radian(pLat)
    pLon = _degree_radian(pLon)
    lat = _degree_radian(lat)
    lon = _degree_radian(lon)
    return _calc_dist_radian(pLat, pLon, lat, lon)


def evaluate_submission(output_data, oracle_data):
#     output_data = etl_data(output_file, submission_type)
#     oracle_data = etl_data(oracle_file, submission_type)
    assert len(output_data) == len(oracle_data)
    accuracy, median_error, mean_error = 0.0, 0.0, 0.0
    right, wrong = 0, 0
    error_distance_list = []
    for output, oracle in zip(output_data, oracle_data):        
        assert output[0] == oracle[0]
        if output[1] == oracle[1]:
            right += 1
        else:
            wrong += 1
        error_distance = calc_dist_degree(output[2], output[3], oracle[2], oracle[3])
        error_distance_list.append(error_distance)

    accuracy = round(right / (right + wrong + 1e-6), 4)
    error_distance_list.sort()
    total_num = len(error_distance_list)
        
    mean_error = round(sum(error_distance_list) / (total_num + 1e-6), 4)
    median_error = round(error_distance_list[int(total_num / 2)], 4)
    
    
    output_file = ''
    submission_type = ''
    
    result = "WNUT evaluation: {}, {}, {}, {}, {}".format(output_file,
                                                  submission_type,
                                                  accuracy,
                                                  median_error,
                                                  mean_error)
    print(result)
    

    
print('Evaluating result')
evaluate_submission(output_data, oracle_data)


Evaluating result
WNUT evaluation: , , 0.1247, 1324.2263, 4067.1548


In [12]:
print('Finished')

Finished
