## Read in the Data

In [1]:
import pandas as pd

# Load the training split from the working directory and preview its first five rows.
dataset = pd.read_csv(filepath_or_buffer="train.csv")
dataset.head(n=5)

Unnamed: 0,profile pic,nums/length username,fullname words,nums/length fullname,name==username,description length,external URL,private,#posts,#followers,#follows,fake
0,1,0.27,0,0.0,0,53,0,0,32,1000,955,0
1,1,0.0,2,0.0,0,44,0,0,286,2740,533,0
2,1,0.1,2,0.0,0,0,0,1,13,159,98,0
3,1,0.0,1,0.0,0,82,0,0,679,414,651,0
4,1,0.0,2,0.0,0,0,0,1,6,151,126,0


In [2]:
# Split the training frame into model inputs and the target.
# The label column is removed by name (matching how the label itself is
# selected) instead of by position, so the target can never end up among the
# features if the CSV's column order changes. Feature column order is preserved.
train_features = dataset.drop(columns='fake')
train_labels = dataset['fake']

### Read in the Test Set

In [3]:
# Load the held-out test split and separate inputs from the target.
# As with the training data, the label is dropped by name rather than by
# position so the feature set does not depend on where 'fake' sits in the file.
dataset_test = pd.read_csv("test.csv")
test_features = dataset_test.drop(columns='fake')
test_labels = dataset_test['fake']
dataset_test.head()

Unnamed: 0,profile pic,nums/length username,fullname words,nums/length fullname,name==username,description length,external URL,private,#posts,#followers,#follows,fake
0,1,0.33,1,0.33,1,30,0,1,35,488,604,0
1,1,0.0,5,0.0,0,64,0,1,3,35,6,0
2,1,0.0,2,0.0,0,82,0,1,319,328,668,0
3,1,0.0,1,0.0,0,143,0,1,273,14890,7369,0
4,1,0.5,1,0.0,0,76,0,1,6,225,356,0


# Random Forest Classification

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# Shallow random forest (trees limited to depth 2) with a fixed seed.
# n_estimators is pinned to 10, the value the recorded run used (see the
# estimator repr in this cell's output). Leaving it implicit makes the results
# depend on the installed scikit-learn version, whose default later became 100,
# and triggers a FutureWarning on the older releases.
clf = RandomForestClassifier(n_estimators=10, max_depth=2, random_state=0)
clf.fit(train_features, train_labels)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [5]:
# Predict a class for every row of the test features with the fitted forest,
# then show the raw label array.
predictions = clf.predict(X=test_features)
print(predictions)

[0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 0 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 0
 1 1 1 1 1 1 0 1 1]


In [6]:
from sklearn.metrics import accuracy_score

# Fraction of test rows the random forest labels correctly.
# scikit-learn metrics take (y_true, y_pred): ground truth first. Plain accuracy
# is symmetric, so the value is unchanged, but keeping the documented order
# avoids silently wrong results if this call is adapted to an asymmetric metric
# such as precision or recall.
accuracy_score(test_labels, predictions)

0.8833333333333333

In [11]:
from sklearn.model_selection import cross_val_score
import numpy as np

# 10-fold cross-validation of the random forest on the training set.
# The folds are evaluated once and the scores reused for both the per-fold
# printout and the mean; the original ran the whole cross-validation twice.
rf_cv_scores = cross_val_score(clf, train_features, train_labels, cv=10)
print(rf_cv_scores)
print(np.mean(rf_cv_scores))

[0.98275862 0.9137931  0.86206897 0.9137931  0.87931034 0.84482759
 0.96551724 0.89655172 0.76785714 0.89285714]
0.8919334975369457


# Support Vector Machine

In [13]:
from sklearn import svm

# RBF-kernel support vector classifier fitted on the raw training features.
# gamma is pinned to 'auto' (1 / n_features), which is what the deprecated
# 'auto_deprecated' placeholder in the recorded repr resolves to. Leaving it
# implicit makes the model depend on the installed scikit-learn version, whose
# default later became 'scale', and raises a FutureWarning on older releases.
# NOTE(review): the features are passed in unscaled and span very different
# ranges (ratios in [0, 1] next to follower counts in the thousands); RBF SVMs
# are sensitive to feature scale -- TODO consider standardising before fitting.
svm_clf = svm.SVC(gamma='auto')
svm_clf.fit(train_features, train_labels)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [17]:
# Predict a class for every row of the test features with the fitted SVM,
# then show the raw label array.
svm_predictions = svm_clf.predict(X=test_features)
print(svm_predictions)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 1 1 0 1 0 1
 1 1 1 0 1 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 1]


In [18]:
# Fraction of test rows the SVM labels correctly.
# Arguments follow scikit-learn's (y_true, y_pred) order: ground truth first.
# The accuracy value itself does not depend on the order, but asymmetric
# metrics would, so the documented order is used.
accuracy_score(test_labels, svm_predictions)

0.6333333333333333

In [19]:
# 10-fold cross-validation of the SVM on the training set; prints one score per fold.
print(
    cross_val_score(estimator=svm_clf, X=train_features, y=train_labels, cv=10)
)

[0.68965517 0.75862069 0.55172414 0.63793103 0.67241379 0.62068966
 0.74137931 0.81034483 0.625      0.57142857]




### Sources:
1. https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
2. https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html
