In [3]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score

#Read Data from label.txt and source_tweets.txt to a panda dataframe
label_dataframe = pd.read_csv("label.txt",sep=':',index_col=1,header=None,names=['label','index'])
one_hot_label_dataframe = pd.get_dummies(label_dataframe,prefix=['label'])
label_dataframe = pd.concat([label_dataframe,one_hot_label_dataframe], axis=1) 
tweets_dataframe = pd.read_csv("source_tweets.txt",sep='\t',index_col=0,header=None,names=['index','text'])
tweets_dataframe = pd.concat([tweets_dataframe,label_dataframe], axis=1)  

#Vectorize the text 
#Can use TfidfVectorizer or CountVectorizer
#vectorizer = TfidfVectorizer()
vectorizer = CountVectorizer(decode_error='ignore')
X = vectorizer.fit_transform(tweets_dataframe['text'])
Y = tweets_dataframe['label']

#Model available, can add new sklearn learn classification model here
models = {
  'Naive Bayes':MultinomialNB(),
  'Ada Boost':AdaBoostClassifier(),
  'Random Forest':RandomForestClassifier(n_estimators=100),
  'SVM':svm.SVC(gamma='scale')
}

#Hold-out validation 
#Seperate the data to train data and test data
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,Y,test_size=0.2)
for key  in models:
  print('='*80)
  print(f'Model: {key}')
  model = models[key]
  model.fit (Xtrain,Ytrain)
  print(f'Traing Score: {model.score(Xtrain,Ytrain)}')
  print(f'Test Score: {model.score(Xtest,Ytest)}')
  predict_string = 'This is a book'
  predict_result = model.predict(vectorizer.transform([predict_string]))
  print (f'Predict result for "{predict_string}" is "{predict_result[0]}"')
  print('='*80)

#Cross Validation
for key  in models:
  print('='*80)
  print(f'Model: {key}')
  model = models[key]
  scores = cross_val_score(model, X, Y, cv=5)
  mean_score = np.mean(scores)
  print(f'Scores: {scores}')
  print(f'Mean Score: {mean_score}')
  predict_string = 'This is a book'
  predict_result = model.predict(vectorizer.transform([predict_string]))
  print (f'Predict result for "{predict_string}" is "{predict_result[0]}"')
  print('='*80)

Model: Naive Bayes
Traing Score: 0.9832214765100671
Test Score: 0.7953020134228188
Predict result for "This is a book" is "false"
Model: Ada Boost
Traing Score: 0.5780201342281879
Test Score: 0.5033557046979866
Predict result for "This is a book" is "non-rumor"
Model: Random Forest
Traing Score: 1.0
Test Score: 0.802013422818792
Predict result for "This is a book" is "false"
Model: SVM
Traing Score: 0.9932885906040269
Test Score: 0.7248322147651006
Predict result for "This is a book" is "false"
Model: Naive Bayes
Scores: [0.75919732 0.76923077 0.79865772 0.78187919 0.78716216]
Mean Score: 0.7792254337118651
Predict result for "This is a book" is "false"
Model: Ada Boost
Scores: [0.5451505  0.56856187 0.52348993 0.52684564 0.55405405]
Mean Score: 0.5436203998211584
Predict result for "This is a book" is "non-rumor"
Model: Random Forest
Scores: [0.76923077 0.7826087  0.77516779 0.77852349 0.78716216]
Mean Score: 0.7785385804425781
Predict result for "This is a book" is "false"
Model: SVM