In [39]:
import json
import emoji
import re
import nltk
import numpy as np
import tweets_processor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from collections import defaultdict
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier

In [17]:
# parsing json file containing tweets
with open('tweets.json') as f_tweets:
    json_tweets = json.load(f_tweets)
#print(json_tweets[1]['text'])

tweets_text = [] # should we change this to numpy
tweets_place = []
for tweet in json_tweets:
    if (tweet['user']['geo_enabled'] == True):
        tweets_text.append(tweet['text'])
        tweets_place.append(tweet['place']['full_name'])

#print(tweets_text[1])
#print(len(tweets_text))
#print(len(tweets_place))

In [28]:
tweets_text, tweets_place = tweets_processor.get_tweets_from_csv()

In [3]:
s = 'test the ü§î üôà emojis, in üòå here üíïüë≠'
s = ''.join(c for c in s if c not in emoji.UNICODE_EMOJI)
print(s)

test the   emojis, in  here 


In [43]:
# vectorizing the text
## ????Add more custom parsing using nltk
#1. change to lower case
#2. stemming
#3. lemma
#4. canonicalize
#5. Remove urls
#6.remove emojis

def stem(match):
 # instantiate the PotterStemmer
 stemmer = PorterStemmer()
 # stem the word matched in the regex
 ts = stemmer.stem(match.group(1))
 return ts

def preprocessor(text):
 # convert to lower case
 text = text.lower()
 # remove urls
 text = re.sub(r'http\S+', '', text)
 #print(text)
 # remove stop words and emojis
 stop = set(stopwords.words('english'))
 text = ' '.join(word for word in nltk.word_tokenize(text) if word not in stop and word not in emoji.UNICODE_EMOJI)
 #print(text)
 # remove special characters and numbers
 text = re.sub(r'\W+', ' ', text)
 #s = re.sub('(_|\(|\))','',s)
 # removing all digits ????????????????? think about this
 text = re.sub(r'(\d+)', '', text)
 # remove emoji
 #text = ''.join(word for word in text if word not in emoji.UNICODE_EMOJI)
 # perform stemming on words
 #s = re.sub(r'(\b\w+\b)',stem,s)
 # remove two letter words
 text = re.sub(r'\b[a-z][a-z]\b','',text)
 # remove single letter 
 text = re.sub(r'\b[a-z]\b','',text)
 # remove underscores
 text = re.sub(r'_','',text)
 
 #print(text)
 return text

def tokenizer(text):
 tokens = nltk.word_tokenize(text)
 return tokens
 
#vectorizer = TfidfVectorizer(preprocessor = preprocessor, tokenizer=tokenizer)
# using bigrams
vectorizer = TfidfVectorizer(preprocessor = preprocessor, tokenizer=tokenizer,ngram_range=(2,2))
tweets_vectorized = vectorizer.fit_transform(tweets_text)

# length of total vocabulary
#print(len(vectorizer.vocabulary_))
print(tweets_vectorized.shape)


# count vectorizer
#count_vectorizer = CountVectorizer(preprocessor = preprocessor, tokenizer=tokenizer)
#using bigrams
count_vectorizer = CountVectorizer(preprocessor = preprocessor, tokenizer=tokenizer,ngram_range=(2,2))
tweets_count_vectorized = count_vectorizer.fit_transform(tweets_text)

# length of total vocabulary
print(count_vectorizer.vocabulary_)
print(count_vectorizer.get_feature_names())
#print(tweets_vectorized.shape)

(10001, 62407)


In [7]:
print(tweets_text[0], tweets_vectorized[1], vectorizer.get_feature_names()[1485], vectorizer.get_feature_names()[1452], vectorizer.get_feature_names()[2172], tweets_place[0])
vocab = vectorizer.get_feature_names()
place_word_score = defaultdict(lambda: defaultdict(lambda: 0.0))
word_place_score = defaultdict(lambda: defaultdict(lambda: 0.0))
word_score = dict()
for index, tweet in enumerate(tweets_vectorized):
    place = tweets_place[index]
    for word_index, score in zip(tweet.indices, tweet.data):
        word = vocab[word_index]
        existing_place = place_word_score.get(place, None)
        if existing_place is not None and existing_place.get(word, None) is not None:
            score = existing_place.get(word) + score
        # stores for each place what are all the words
        place_word_score[place][word] = score
        word_score_in_dict = word_score.get(word, None)
        # update the dict with the place and word only when there is no word in dict(word_score) with a score or if the score is greater for this place than the previous place
        # also update the word_score with new score
        if word_score_in_dict is None or word_score_in_dict < score:
            word_place_score[word] = {place:score}
            word_score[word] = score
print(word_place_score)

text   (0, 8924)	0.23670616541879838
  (0, 14308)	0.3338910725889262
  (0, 16218)	0.5699211122735731
  (0, 8931)	0.39683969623087334
  (0, 12274)	0.36238570578204915
  (0, 11820)	0.4678372841311734 birthright billboard callin region


In [34]:
# splitting the data into train, dev, test
#train_data, train_labels = tweets_text[:700], tweets_place[:700] # ????see if we automatically split it proportinately irrespective of size 
#dev_data, dev_labels = tweets_text[700:901], tweets_place[700:901]
#test_data, test_labels = tweets_text[901:], tweets_place[901:]
print(tweets_text[1], tweets_vectorized[1], vectorizer.get_feature_names()[31054], vectorizer.get_feature_names()[1452], vectorizer.get_feature_names()[2172], tweets_place[1])

# using train_test_split
train_data, test_data, train_labels, test_labels = train_test_split(tweets_vectorized, tweets_place, test_size=0.33, random_state=0)

#using count vectorized tweets
#train_data, test_data, train_labels, test_labels = train_test_split(tweets_count_vectorized, tweets_place, test_size=0.33, random_state=0)

When you like something on twit a lil pumpkin pops upüéÉüôà   (0, 31054)	0.4548641685301028
  (0, 49677)	0.4548641685301028
  (0, 57029)	0.4548641685301028
  (0, 31209)	0.4354858003208915
  (0, 42986)	0.4354858003208915 america phonies amendment requires another cap cincinnati


In [36]:
train_data[0]

<1x62407 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [31]:
# training the model using logisticRegression with multinomial
####???? if there is a better library to use and a better classifier to use, should i svm, ridge classifier?

# Logistic Regression
lg = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
lg.fit(train_data, train_labels)

print("Accuracy on test set: {:.02%}".format(lg.score(test_data, test_labels)))
# print(lg.score(test_data, test_labels))

#Ridge Classifier
rc = RidgeClassifier()
rc.fit(train_data, train_labels)

pred = rc.predict(test_data)

print("Accuracy on test set: {:.02%}".format(rc.score(test_data, test_labels)))
#print(rc.score(test_data, test_labels))
#print(classification_report(test_labels, pred))


Accuracy on test set: 15.21%
Accuracy on test set: 14.66%


In [32]:
# using naive bayes
nb = MultinomialNB()
nb.fit(train_data, train_labels)
y_pred = nb.predict(test_data)
acc = accuracy_score(test_labels, y_pred)
print("Accuracy on test set: {:.02%}".format(acc))

Accuracy on test set: 13.90%


In [27]:
# using MLP classifier
mlp = MLPClassifier(hidden_layer_sizes=(100, ),)
mlp.fit(train_data, train_labels)
y_pred = mlp.predict(test_data)
acc = accuracy_score(test_labels, y_pred)
print("Accuracy on test set: {:.02%}".format(acc))

Accuracy on test set: 13.39%


