In [65]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
import scipy

In [66]:
df = pd.read_csv('Task_1_prepprocessed.csv')

In [67]:
df.dropna(inplace=True)

In [68]:
X = df.iloc[:, 1:]
y = df.iloc[:, 0]

In [69]:
X.head(10)

Unnamed: 0,date,from,to,subject,body
0,4,info@global-change.com,michelle.lokay@enron.com,next wave energi trade,energi industri profession global chang associ...
1,1,info@pmaconference.com,michelle.lokay@enron.com,regist next txu capac auction,regist next txu energi capac auction new regis...
2,6,info@pmaconference.com,michelle.lokay@enron.com,merchant power monthli free sampl,merchant power monthli month s issu almost mw ...
3,3,bruno@firstconf.com,energynews@fc.ease.lsoft.com,eyeforenergi updat,welcom week s eyeforenergi updat refresh memor...
4,1,deanrogers@energyclasses.com,michelle.lokay@enron.com,deriv earli bird til march houston,deriv energi profession two full day april ear...
5,0,10182829@mbox.surecom.com,michelle.lokay@enron.com,power outag outag free inform,energi argu power plant outag servic come comp...
6,3,info@pmaconference.com,michelle.lokay@enron.com,market price volatil may houston,market price volatil may houston tx click down...
7,1,allison@firstconf.com,michelle.lokay@enron.com,weather trade risk manag latest develop,dear michel weather trade market open busi eur...
8,3,info@energyclasses.com,michelle.lokay@enron.com,transmiss studi avail,u s transmiss industri report energi info sour...
9,0,info@pmaconference.com,michelle.lokay@enron.com,new power execut free sampl issu,welcom download free trial issu new power exec...


In [70]:
y.head(10)

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
Name: class, dtype: int64

In [71]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=65)

# 1

In [72]:
vectorizer = TfidfVectorizer()
X_train_1 = vectorizer.fit_transform(X_train.body)
X_test_1 = vectorizer.transform(X_test.body)

In [73]:
model = RandomForestClassifier(n_estimators=10, n_jobs=10, random_state=65)

In [74]:
model.fit(X_train_1, y_train)

RandomForestClassifier(n_estimators=10, n_jobs=10, random_state=65)

In [75]:
y_pred = model.predict(X_test_1)

In [76]:
metrics = classification_report(y_test, y_pred, digits=3, output_dict=True)

In [77]:
print('Precision: ', round(metrics['macro avg']['precision'],3))
print('Recall: ', round(metrics['macro avg']['recall'],3))
print('F1: ', round(metrics['macro avg']['f1-score'],3))

Precision:  0.967
Recall:  0.963
F1:  0.965


# 2

In [78]:
vectorizer = TfidfVectorizer()

In [79]:
X_train_2 = scipy.sparse.hstack((vectorizer.fit_transform(X_train.body + X_train.subject), X_train.date.to_numpy().reshape((-1, 1))))
X_test_2 = scipy.sparse.hstack((vectorizer.transform(X_test.body + X_test.subject), X_test.date.to_numpy().reshape((-1, 1))))

In [80]:
model = RandomForestClassifier(n_estimators=10, random_state=65)
model.fit(X_train_2, y_train)

RandomForestClassifier(n_estimators=10, random_state=65)

In [81]:
y_pred = model.predict(X_test_2)
metrics = classification_report(y_test, y_pred, digits=3, output_dict=True)

In [82]:
print('Precision: ', round(metrics['macro avg']['precision'], 3))
print('Recall: ', round(metrics['macro avg']['recall'], 3))
print('F1: ', round(metrics['macro avg']['f1-score'], 3))

Precision:  0.968
Recall:  0.965
F1:  0.966


# 3

In [83]:
vectorizer = TfidfVectorizer(ngram_range=(2, 2))

In [84]:
X_train_3 = vectorizer.fit_transform(X_train.body)
X_test_3 = vectorizer.transform(X_test.body)

In [85]:
model = RandomForestClassifier(n_estimators=10, n_jobs=10, random_state=65)

In [86]:
model.fit(X_train_3, y_train)

RandomForestClassifier(n_estimators=10, n_jobs=10, random_state=65)

In [87]:
y_pred = model.predict(X_test_3)
metrics = classification_report(y_test, y_pred, digits=3, output_dict=True)

In [88]:
print('Precision: ', round(metrics['macro avg']['precision'], 3))
print('Recall: ', round(metrics['macro avg']['recall'], 3))
print('F1: ', round(metrics['macro avg']['f1-score'], 3))

Precision:  0.953
Recall:  0.947
F1:  0.949


In [89]:
with open('task_1_P_test_2.csv', 'r') as f:
    text = f.read()

In [90]:
model.predict(vectorizer.transform([text]))

array([0], dtype=int64)

In [91]:
model.predict_proba(vectorizer.transform([text]))

array([[0.7, 0.3]])

In [92]:
round(_[0][0], 3)

0.7