# Exercice 2 : Api Release

In [2]:
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.datasets import load_wine
from sklearn.metrics import accuracy_score

In [2]:
wine = load_wine()

In [3]:
data = pd.DataFrame(data= np.c_[wine['data'], wine['target']],
                     columns= wine['feature_names'] + ['target'])
data.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0.0


In [4]:
X_train = data[:-20]
X_test = data[-20:]

y_train = X_train.target
y_test = X_test.target

X_train = X_train.drop('target',1)
X_test = X_test.drop('target',1)

In [5]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

In [6]:
y_pred = clf.predict(X_test)

In [7]:
print("accuracy_score: %.2f"
      % accuracy_score(y_test, y_pred))

accuracy_score: 0.85


In [8]:
import pickle
pickle.dump(clf, open('models/final_prediction.pickle', 'wb'))

# Exercice 3

## Load dataset

In [3]:
df = pd.read_csv("https://s3.amazonaws.com/assets.datacamp.com/blog_assets/fake_or_real_news.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


## Cleaning

In [4]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [5]:
import nltk
nltk.download('punkt')
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import spacy
!python -m spacy download en_core_web_sm
import en_core_web_sm
import re

[nltk_data] Downloading package punkt to /Users/thomasduv/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/thomasduv/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [6]:
articles_list_words =[]
for index, row in df.iterrows():
    article = row['text']
    article_words = nltk.word_tokenize(article, language='english')
    articles_list_words.append(article_words)

In [7]:
stopWords = set(stopwords.words('english'))
p = re.compile('[^\w\s]|_|\d+|^\w$')
corpusCleaned = []
for list_of_words in articles_list_words:
    new_words = []
    for word in list_of_words:
        if not word in stopWords:
            if not p.match(word):
                new_words.append(word.lower())
    corpusCleaned.append(new_words)

## Train & Test sets

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
y = df.label

In [10]:
df['text']=corpusCleaned

In [11]:
df.drop("label", axis=1)

Unnamed: 0.1,Unnamed: 0,title,text
0,8476,You Can Smell Hillary’s Fear,"[daniel, greenfield, shillman, journalism, fel..."
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,"[google, pinterest, digg, linkedin, reddit, st..."
2,3608,Kerry to go to Paris in gesture of sympathy,"[u.s., secretary, state, john, f., kerry, said..."
3,10142,Bernie supporters on Twitter erupt in anger ag...,"[kaydee, king, kaydeeking, november, the, less..."
4,875,The Battle of New York: Why This Primary Matters,"[it, primary, day, new, york, front-runners, h..."
...,...,...,...
6330,4490,State Department says it can't find emails fro...,"[the, state, department, told, republican, nat..."
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,"[the, pbs, should, stand, plutocratic, pentago..."
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,"[anti-trump, protesters, are, tools, oligarchy..."
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","[addis, ababa, ethiopia, obama, convened, meet..."


In [12]:
Xtrain, Xtest, ytrain, ytest = train_test_split(df['text'], y,
                                                random_state=1)

## Models

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import confusion_matrix

In [15]:
def identity_tokenizer(text):
        return text
tfidf = TfidfVectorizer(tokenizer=identity_tokenizer, lowercase=False)  

#tfidf = TfidfVectorizer(lowercase=False)
train_vectors = tfidf.fit_transform(Xtrain)

In [16]:
test_vectors = tfidf.transform(Xtest)

In [17]:
pa_classifier=PassiveAggressiveClassifier(max_iter=50)
pa_classifier.fit(train_vectors,ytrain)

PassiveAggressiveClassifier(max_iter=50)

In [18]:
y_pred=pa_classifier.predict(test_vectors)
score=accuracy_score(ytest,y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 94.76%


In [19]:
confusion_matrix(ytest,y_pred, labels=['FAKE','REAL'])

array([[775,  42],
       [ 41, 726]])

## Release our model

In [20]:
import pickle
pickle.dump(pa_classifier, open('models/TP7EX3_prediction.pickle', 'wb'))

In [21]:
pickle.dump(tfidf, open('models/tfidf.pickle', 'wb'))