In [1]:
from TextPreprocessor import *
import pandas as pd

from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

TextPreprocessor : v1.2


### Read the excel file and remove blank lines in the `clean` column

In [24]:
df = pd.read_excel("Data/5.CleanText/SportEMM_Violence.xlsx")

In [25]:
len(df)

548

In [26]:
df = df.dropna(axis=0, subset=['clean'])

In [27]:
len(df)

470

### We split the file between lines that have been evaluated and the one that have not

In [28]:
df_fit = df[(df.Evaluation == "T") | (df.Evaluation == "F")]
df_inconnu = df[~((df.Evaluation == "T") | (df.Evaluation == "F"))]

In [29]:
len(df_inconnu)

0

### We extract the texts and the evaluations

In [16]:
X_all = df.clean.values
X_eval = df_fit.clean.values
X_inconnu = df_inconnu.clean.values
y_eval = df_fit.Evaluation.values

### We prepare the Pipeline to transform the data and the TFIDF vectoriezr

In [17]:
transform = Pipeline([
    ('token', NLTK_Tokenizer()),
    ('clean', NLTK_Cleaner()),
    ('lemmatize', NLTK_Lemmatizer()),
])

tfidf_transformer = TfidfVectorizer(tokenizer=lambda x:x, lowercase=False)

### We fit / transform both groups (evaluated and not).
Here, the fit methods of the NLTK Tokenizer, Cleaner and Lemmatizer do nothing. 

In [18]:
X_transform_eval = transform.fit_transform(X_eval)
X_transform_inconnu = transform.fit_transform(X_inconnu)


In [22]:
len(X_transform_inconnu)

0

### We fit the Vectorizer on all the documents and transform the whole
Fit on all the documents means that all the documents will be taken into account when computing the frequencies of each terms. It also means that all the words will be prensent

In [19]:
tfidf_transformer.fit(X_transform_eval + X_transform_inconnu)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function <lambda> at 0x000001FE63D10BF8>, use_idf=True,
        vocabulary=None)

In [20]:
X_vect_eval = tfidf_transformer.transform(X_transform_eval)
X_vect_inconnu = tfidf_transformer.transform(X_transform_inconnu)

ValueError: Found array with 0 sample(s) (shape=(0, 19867)) while a minimum of 1 is required by the normalize function.

### We split the already evaluated set into Train / Test sets

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X_vect_eval, y_eval, test_size=0.33, random_state=42)

### We select a classifier
Here it's SVM

In [35]:
clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None)

### We fit the model on the train set

In [36]:
clf.fit(X_train, y_train)

SGDClassifier(alpha=0.001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=5, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False)

### We predict the test set and look at the score

In [38]:
pred = clf.predict(X_test)

In [39]:
print(metrics.classification_report(y_test, pred))

             precision    recall  f1-score   support

          F       0.93      0.95      0.94        93
          T       0.91      0.87      0.89        55

avg / total       0.92      0.92      0.92       148



### We predict the unknown 

In [42]:
pred_inconnu = clf.predict(X_vect_inconnu)

### We put the results back into the dataframe

In [45]:
df_inconnu.Evaluation = pred_inconnu

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [48]:
df_inconnu.Evaluation.describe()

count     342
unique      2
top         F
freq      229
Name: Evaluation, dtype: object

### We merge the dataframes to have one table

In [53]:
df_final = df_fit.append(df_inconnu)

In [55]:
df_final.Evaluation.describe()

count     788
unique      2
top         F
freq      496
Name: Evaluation, dtype: object

### We write to Excel

In [56]:
writer = pd.ExcelWriter(r'Predicted.xlsx', engine='xlsxwriter', options={'strings_to_urls': False})
df_final.to_excel(writer)
writer.close()

### The classifier can be put into a Grid_Search in order to test mulitples parameters
Here, it is also possible to test multiple classifiers with multiple parameters for each.

In [None]:
clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None)

parameters = {'alpha': (1e-5, 1e-6, 1e-7),
             'penalty': ('l2','elasticnet'),
             'max_iter': (5,10,50,80)}

gs = GridSearchCV(clf, parameters, n_jobs=-1)
gs.fit(X_train, y_train)

pred = gs.predict(X_test)