In [None]:
# Import and examine the dataset
import pandas as pd

# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists.
tweets_path = r'C:/Users/Yeo Kheng Feng/Desktop/twitter_classification_project/random_tweets.json'

# lines=True: the file is parsed as one JSON object per line.
all_tweets = pd.read_json(tweets_path, encoding="Latin-1", lines=True)

# Number of tweets loaded and the fields available for each of them.
print(len(all_tweets))
print(all_tweets.columns)

# Look at the first record: the 'text' column contains the tweet itself,
# and the 'user' column is printed to show what it holds.
first_tweet = all_tweets.loc[0]
print(first_tweet['text'])
print(first_tweet["user"])

In [None]:
# Defining a viral tweet
# Summary statistics of the retweet counts, used to pick a "viral" cut-off.
retweet_summary = all_tweets["retweet_count"].describe()
print(retweet_summary)

In [None]:
import numpy as np

# A tweet is labelled viral (1) when it has more than 429 retweets, else 0.
# NOTE(review): 429 is a hand-picked cut-off; confirm it against the
# describe() output of 'retweet_count'.
viral_mask = all_tweets['retweet_count'] > 429
all_tweets['is_viral'] = np.where(viral_mask, 1, 0)

# Class balance of the resulting label.
print(all_tweets['is_viral'].value_counts())

In [None]:
# Making features
#
# Each feature is derived either from the tweet text or from the nested
# 'user' record of the same row, and is stored as a new column of all_tweets.


def _average_word_length(text):
    """Return the mean number of characters per whitespace-separated word.

    Returns 0.0 when `text` contains no words (empty or whitespace-only),
    instead of raising ZeroDivisionError.
    """
    words = text.split()
    if not words:
        return 0.0
    return sum(len(word) for word in words) / len(words)


tweet_text = all_tweets['text']
tweet_user = all_tweets['user']

all_tweets['tweet_length'] = tweet_text.apply(len)
all_tweets['followers_count'] = tweet_user.apply(lambda user: user['followers_count'])
all_tweets['friends_count'] = tweet_user.apply(lambda user: user['friends_count'])
# Count the plain '#' character. The previous pattern was written with a
# backslash in front of the '#', which Python keeps as a literal backslash,
# so it only matched a backslash-hash pair and the count was wrong.
all_tweets['hashtags_count'] = tweet_text.apply(lambda text: text.count('#'))
all_tweets['links_count'] = tweet_text.apply(lambda text: text.count('http'))
all_tweets['words_count'] = tweet_text.apply(lambda text: len(text.split()))
all_tweets['average_length_count'] = tweet_text.apply(_average_word_length)
print(all_tweets.head())

In [None]:
# Examine features relationship with labels
features = [
    'tweet_length',
    'followers_count',
    'friends_count',
    'hashtags_count',
    'words_count',
    'links_count',
    'average_length_count',
]

# Correlation of each feature with the raw retweet count, rounded to
# 5 decimal places.
retweet_counts = all_tweets['retweet_count']
for feature in features:
    coefficient = np.corrcoef(retweet_counts, all_tweets[feature])[0, 1]
    print('Correlation coefficient of retweet_counts with ' + feature + ':', round(coefficient, 5))

In [None]:
# Normalising the data
from sklearn.preprocessing import scale

feature_columns = [
    'tweet_length',
    'followers_count',
    'friends_count',
    'hashtags_count',
    'words_count',
    'links_count',
    'average_length_count',
]

# Target vector and feature table taken from the tweets frame.
labels = all_tweets['is_viral']
data = all_tweets[feature_columns]

# Standardise column by column (axis=0).
scaled_data = scale(data, axis=0)

# Compare the first row before and after scaling.
print(data.loc[0])
print(scaled_data[0])

In [None]:
#Split data into training and test sets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the unscaled feature table first. Standardising before the split
# would compute the mean/variance from rows that later end up in the test
# set, leaking test information into the training features.
train_features, test_features, train_labels, test_labels = train_test_split(
    data, labels, test_size=0.2, random_state=1
)

# Fit the scaler on the training rows only, then apply the same transform to
# both splits so the test set is scaled with training statistics.
feature_scaler = StandardScaler().fit(train_features)
train_data = feature_scaler.transform(train_features)
test_data = feature_scaler.transform(test_features)

In [None]:
# Train logistic regression model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Fit on the training split and predict the labels of the test split.
log_reg = LogisticRegression()
log_reg.fit(train_data, train_labels)
predictions = log_reg.predict(test_data)

# Report accuracy, recall, precision and F1 on the test split, one per line.
for metric in (accuracy_score, recall_score, precision_score, f1_score):
    print(metric(test_labels, predictions))

In [None]:
# Train k-Nearest neighbors model
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

# Fit one classifier per neighbourhood size (k = 1 .. 499) and record its
# accuracy on the test split.
k_values = range(1, 500)
scores = []
for k in k_values:
    classifier = KNeighborsClassifier(n_neighbors=k).fit(train_data, train_labels)
    scores.append(classifier.score(test_data, test_labels))

# Print classifier accuracy over a range of k
plt.plot(k_values, scores)
plt.xlabel('k neighbors')
plt.ylabel('Classifier Accuracy')
plt.show()

In [None]:
# Repeat the sweep over a narrow range (k = 1 .. 19) to see the small-k
# region of the accuracy curve in detail.
candidate_ks = range(1, 20)
scores = []
for k in candidate_ks:
    classifier = KNeighborsClassifier(n_neighbors=k).fit(train_data, train_labels)
    accuracy = classifier.score(test_data, test_labels)
    scores.append(accuracy)

plt.plot(candidate_ks, scores)
plt.xlabel('k neighbors')
plt.ylabel('Classifier Accuracy')
plt.show()

In [None]:
#from sklearn.svm import SVC

#svm_clf = SVC(kernel='poly', degree=3, coef0=1, C=5)
#svm_clf.fit(train_data, train_labels)
#svm_clf.score(test_data, test_labels)

In [None]:
#from sklearn.ensemble import VotingClassifier
#from sklearn.metrics import accuracy_score

#log_clf = LogisticRegression()
#knn_clf = KNeighborsClassifier(n_neighbors=10)

#voting_clf = VotingClassifier(estimators=[('lr', log_clf), ('knn', knn_clf)], voting='hard')
#voting_clf.fit(train_data, train_labels)

#for clf in (log_clf, knn_clf, voting_clf):
#    clf.fit(train_data, train_labels)
#    y_pred = clf.predict(test_data)
#    print(accuracy_score(test_labels, y_pred))