In [114]:
import pandas as pd                 # pandas is a dataframe library
import matplotlib.pyplot as plt     # matplotlib.pyplot plots data
import numpy as np                  # numpy provides N-dim object support

# do plotting inline instead of in a separate window
%matplotlib inline

In [4]:
# READ AND PARSE AIRLINE FILE
# Only the sentiment label and the raw "[lat, lng]" coordinate string are loaded.
airline_tweets = pd.read_csv(
    'airline_tweets.csv',
    usecols=['airline_sentiment', 'tweet_coord'],
    encoding='utf-8',
)
# Drop rows with a missing value and rows whose coordinate is the literal
# '[0.0, 0.0]', then renumber the rows from 0. Resetting the index *before*
# adding columns means the assignments below operate on a fresh frame rather
# than on a filtered slice (avoids SettingWithCopy problems).
airline_tweets = airline_tweets.dropna()
airline_tweets = airline_tweets.loc[airline_tweets['tweet_coord'] != '[0.0, 0.0]']
airline_tweets = airline_tweets.reset_index(drop=True)

# Encode the sentiment text as an integer class id.
sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
airline_tweets['airline_sentiment'] = airline_tweets['airline_sentiment'].map(sentiment_map)

# Remove the square brackets around the coordinate pair. The result is
# assigned back explicitly: an inplace replace on a column selection is
# chained assignment and does not reliably update the parent frame.
airline_tweets['tweet_coord'] = airline_tweets['tweet_coord'].replace(
    to_replace=r'[\[\]]', value='', regex=True
)

# Split "lat, lng" into two numeric columns (vectorised instead of row-wise apply).
split_c = airline_tweets['tweet_coord'].str.split(',', expand=True)
airline_tweets['latitude'] = split_c[0].astype(float)
airline_tweets['longitude'] = split_c[1].astype(float)
# NOTE: 'tweet_coord' is kept. The earlier un-assigned
# `airline_tweets.drop(['tweet_coord'], axis=1)` discarded its result and was
# therefore a no-op; the column is still shown in the frames displayed below.

# READ AND PARSE CITIES
cities = pd.read_csv(
    "cities.csv",
    delimiter=",",
    header='infer',
    usecols=['name', 'latitude', 'longitude'],
    encoding='utf-8',
)
cities = cities.dropna()
# Keep each city's [latitude, longitude] pair as a list in a single column.
cities['city_coord'] = cities[['latitude', 'longitude']].values.tolist()

In [5]:
# Preview the first five parsed tweets.
airline_tweets.head()

Unnamed: 0,airline_sentiment,tweet_coord,latitude,longitude
0,2,"40.74804263, -73.99295302",40.748043,-73.992953
1,0,"42.361016, -71.02000488",42.361016,-71.020005
2,1,"33.94540417, -118.4062472",33.945404,-118.406247
3,0,"33.94209449, -118.40410103",33.942094,-118.404101
4,2,"33.2145038, -96.9321504",33.214504,-96.93215


In [6]:
# Preview the first five city rows.
cities.head()

Unnamed: 0,name,latitude,longitude,city_coord
0,les Escaldes,42.50729,1.53414,"[42.507290000000005, 1.53414]"
1,Andorra la Vella,42.50779,1.52109,"[42.50779, 1.52109]"
2,Umm al Qaywayn,25.56473,55.55517,"[25.564729999999997, 55.55517]"
3,Ras al-Khaimah,25.78953,55.9432,"[25.78953, 55.9432]"
4,Khawr Fakkān,25.33132,56.34199,"[25.33132, 56.34199]"


In [52]:
# Find the nearest city to each tweet coordinate using a cKDTree
# nearest-neighbour query with the Euclidean (p=2) metric.
from scipy.spatial import cKDTree

def closest_pts(setA_lat, setA_lng, setB_lat, setB_lng, setB_names=None):
    """Return, for every point in set A, the closest point in set B.

    Distance is plain Euclidean distance on the raw (latitude, longitude)
    values, so it is only an approximation of geographic distance.

    Parameters
    ----------
    setA_lat, setA_lng : pandas.Series
        Coordinates of the query points.
    setB_lat, setB_lng : pandas.Series
        Coordinates of the candidate points.
    setB_names : pandas.Series or list-like, optional
        Names of the candidate points, in the same row order as
        ``setB_lat`` / ``setB_lng``. Defaults to the notebook-level
        ``cities['name']`` so existing four-argument calls keep working.

    Returns
    -------
    tuple of three pandas.Series
        Latitude, longitude and name of the nearest candidate for each query
        point. The two coordinate series are numbered 0..n-1; the name series
        keeps the row labels of ``setB_names``.
    """
    if setB_names is None:
        setB_names = cities['name']

    b_x = setB_lat.values
    b_y = setB_lng.values
    query_pts = np.c_[setA_lat.values, setA_lng.values]
    candidate_pts = np.c_[b_x, b_y]

    # query() returns (distances, positions); only the positions are needed.
    _, indx = cKDTree(candidate_pts).query(query_pts, k=1, p=2)

    # `indx` holds *positions* into the candidate arrays, so the names must be
    # looked up with .iloc. A label lookup (names[indx]) returns the wrong rows
    # or raises KeyError whenever the name index is not exactly 0..n-1, e.g.
    # after dropna() without a reset_index().
    nearest_names = pd.Series(setB_names).iloc[indx]
    return pd.Series(b_x[indx]), pd.Series(b_y[indx]), pd.Series(nearest_names)

In [53]:
# Query points: the tweet locations.
setA_lat, setA_lng = airline_tweets['latitude'], airline_tweets['longitude']

# Candidate points: the city locations.
setB_lat, setB_lng = cities['latitude'], cities['longitude']

In [54]:
# Nearest city (latitude, longitude, name) for every tweet.
c_x, c_y, c_n = closest_pts(
    setA_lat=setA_lat,
    setA_lng=setA_lng,
    setB_lat=setB_lat,
    setB_lng=setB_lng,
)

In [55]:
# First five matched city names (still carrying the city-table row labels).
c_n.head()

21569    New York City
21062          Chelsea
22020       El Segundo
22020       El Segundo
20796           Frisco
Name: name, dtype: object

In [56]:
# Replace the city-table row labels with 0..n-1 so the names can be attached
# to airline_tweets, whose index was reset the same way.
c_n = c_n.reset_index(drop=True)
c_n.head()

0    New York City
1          Chelsea
2       El Segundo
3       El Segundo
4           Frisco
Name: name, dtype: object

In [57]:
# Attach the matched city's coordinates and name to every tweet row.
airline_tweets = airline_tweets.assign(
    nearest_latitude=c_x,
    nearest_longitude=c_y,
    nearest_city=c_n,
)

In [59]:
# Inspect the first 100 tweets together with their nearest city.
airline_tweets.head(100)

Unnamed: 0,airline_sentiment,tweet_coord,latitude,longitude,nearest_latitude,nearest_longitude,nearest_city
0,2,"40.74804263, -73.99295302",40.748043,-73.992953,40.71427,-74.00597,New York City
1,0,"42.361016, -71.02000488",42.361016,-71.020005,42.39176,-71.03283,Chelsea
2,1,"33.94540417, -118.4062472",33.945404,-118.406247,33.91918,-118.41647,El Segundo
3,0,"33.94209449, -118.40410103",33.942094,-118.404101,33.91918,-118.41647,El Segundo
4,2,"33.2145038, -96.9321504",33.214504,-96.932150,33.15067,-96.82361,Frisco
5,1,"34.0219817, -118.38591198",34.021982,-118.385912,34.02112,-118.39647,Culver City
6,1,"33.57963333, -117.73024772",33.579633,-117.730248,33.56504,-117.72712,Aliso Viejo
7,0,"40.6413712, -73.78311558",40.641371,-73.783116,40.66312,-73.76221,Springfield Gardens
8,2,"36.08457854, -115.13780136",36.084579,-115.137801,36.09719,-115.14666,Paradise
9,1,"37.79374402, -122.39327564",37.793744,-122.393276,37.77493,-122.41942,San Francisco


In [303]:
# Model input: the nearest-city name, kept as a one-column DataFrame.
feature_df = airline_tweets.loc[:, ['nearest_city']]

In [304]:
# Model target: the integer-encoded sentiment, as a Series.
label_df = airline_tweets.loc[:, 'airline_sentiment']

In [305]:
# Sanity check: the list-style column selection above yields a DataFrame.
type(feature_df)

pandas.core.frame.DataFrame

In [307]:
# Sanity check: the single-column selection above yields a Series.
type(label_df)

pandas.core.series.Series

In [318]:
# One-hot encode the nearest-city name: one indicator column per distinct
# city, each named "nearest_city_<city>".
features = pd.get_dummies(feature_df["nearest_city"], prefix="nearest_city")
features.head()

Unnamed: 0,nearest_city_Addison,nearest_city_Aldine,nearest_city_Aliso Viejo,nearest_city_Allen,nearest_city_American Fork,nearest_city_Anaheim,nearest_city_Angeles City,nearest_city_Annapolis,nearest_city_Arbutus,nearest_city_Ashford,...,nearest_city_Whitestone,nearest_city_Williamsport,nearest_city_Willow Grove,nearest_city_Windsor,nearest_city_Winnipeg,nearest_city_Winter Park,nearest_city_Wolverhampton,nearest_city_Woodstock,nearest_city_Xiuying,nearest_city_Zionsville
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [319]:
from sklearn import preprocessing

# Fit the encoder on the sentiment ids and convert them to class labels in
# one step; the learned classes are printed for reference.
le = preprocessing.LabelEncoder()
labels = le.fit_transform(label_df)
print(list(le.classes_))
labels[0:5]

[0, 1, 2]


array([2, 0, 1, 0, 2])

In [320]:
# (rows, one-hot columns) of the feature matrix.
features.shape

(855, 291)

In [321]:
# Number of labels; should match the row count of `features`.
labels.shape

(855,)

In [322]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# NOTE(review): every row is used for fitting; there is no held-out split,
# so the predictions below are made on the data the model was trained on.
train_features = features
train_labels = labels

lr_model = LogisticRegression(C=0.99, random_state=42).fit(train_features, train_labels)
lr_predict_class = lr_model.predict(train_features)

In [325]:
# training metrics (scored on the same rows the model was fitted on)
class_ids = [0, 1, 2]
accuracy = metrics.accuracy_score(train_labels, lr_predict_class)

print("Accuracy: {0:.4f}".format(accuracy))
print(metrics.confusion_matrix(train_labels, lr_predict_class, labels=class_ids))
print("Classification Report")
print(metrics.classification_report(train_labels, lr_predict_class, labels=class_ids))

Accuracy: 0.6807
[[573   0   0]
 [129   3   0]
 [144   0   6]]
Classification Report
             precision    recall  f1-score   support

          0       0.68      1.00      0.81       573
          1       1.00      0.02      0.04       132
          2       1.00      0.04      0.08       150

avg / total       0.78      0.68      0.56       855

