In [106]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from scipy.stats import spearmanr

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, make_scorer

import warnings
warnings.filterwarnings('ignore')

In [2]:
PATH_RAW = '../data/raw_data/'
PATH_INTER = '../data/intermediate/'
PATH_DATA = '../data/data/'
PATH_MODELS = '../../models/'
PATH_IMAGES = '../assets/images/'

In [75]:
train = pd.read_csv(PATH_DATA + "CRC_2880_train.csv")
val = pd.read_csv(PATH_DATA + "CRC_320_val.csv")
test_CRC = pd.read_csv(PATH_DATA + "CRC_800_test.csv")

In [77]:
train.loc[train['read_score'] < 0.0, 'read_score'] = 0.0
train.loc[train['read_score'] > 100.0, 'read_score'] = 100.0

In [78]:
def plot_value_counts(df, col):
    """Plot value counts of a column"""
        
    plt.figure(figsize = (12, 5))
    df[col].value_counts().sort_index().plot.bar(color = 'blue',
                                                 edgecolor = 'k',
                                                 linewidth = 1.5)
    plt.xlabel(f'{col}'); plt.title(f'{col} Value Counts'); plt.ylabel('Count')
    plt.show();

def plot_categoricals(x, y, data, annotate = True):
    """Plot counts of two categoricals.
    Size is raw count for each grouping.
    Percentages are for a given value of y."""
    
    # Raw counts 
    raw_counts = pd.DataFrame(data.groupby(y)[x].value_counts(normalize = False))
    raw_counts = raw_counts.rename(columns = {x: 'raw_count'})
    
    # Calculate counts for each group of x and y
    counts = pd.DataFrame(data.groupby(y)[x].value_counts(normalize = True))
    
    # Rename the column and reset the index
    counts = counts.rename(columns = {x: 'normalized_count'}).reset_index()
    counts['percent'] = 100 * counts['normalized_count']
    
    # Add the raw count
    counts['raw_count'] = list(raw_counts['raw_count'])
    
    plt.figure(figsize = (14, 10))
    # Scatter plot sized by percent
    plt.scatter(counts[x], counts[y], edgecolor = 'k', color = 'lightgreen',
                s = 100 * np.sqrt(counts['raw_count']), marker = 'o',
                alpha = 0.6, linewidth = 1.5)
    
    if annotate:
        # Annotate the plot with text
        for i, row in counts.iterrows():
            # Put text with appropriate offsets
            plt.annotate(xy = (row[x] - (1 / counts[x].nunique()), 
                               row[y] - (0.15 / counts[y].nunique())),
                         color = 'navy',
                         s = f"{round(row['percent'], 1)}%")
        
    # Set tick marks
    plt.yticks(counts[y].unique())
    plt.xticks(counts[x].unique())
    
    # Transform min and max to evenly space in square root domain
    sqr_min = int(np.sqrt(raw_counts['raw_count'].min()))
    sqr_max = int(np.sqrt(raw_counts['raw_count'].max()))
    
    # 5 sizes for legend
    msizes = list(range(sqr_min, sqr_max,
                        int(( sqr_max - sqr_min) / 5)))
    markers = []
    
    # Markers for legend
    for size in msizes:
        markers.append(plt.scatter([], [], s = 100 * size, 
                                   label = f'{int(round(np.square(size) / 100) * 100)}', 
                                   color = 'lightgreen',
                                   alpha = 0.6, edgecolor = 'k', linewidth = 1.5))
        
    # Legend and formatting
    plt.legend(handles = markers, title = 'Counts',
               labelspacing = 3, handletextpad = 2,
               fontsize = 16,
               loc = (1.10, 0.19))
    
    plt.annotate(f'* Size represents raw count while % is for a given y value.',
                 xy = (0, 1), xycoords = 'figure points', size = 10)
    
    # Adjust axes limits
    plt.xlim((counts[x].min() - (6 / counts[x].nunique()), 
              counts[x].max() + (6 / counts[x].nunique())))
    plt.ylim((counts[y].min() - (4 / counts[y].nunique()), 
              counts[y].max() + (4 / counts[y].nunique())))
    plt.grid(None)
    plt.xlabel(f"{x}"); plt.ylabel(f"{y}"); plt.title(f"{y} vs {x}");


In [80]:
correl = train.corr()

correl['tag'].sort_values(ascending=False)

tag                1.000000
bin_tag            0.866370
num_positives      0.461502
num_tokens         0.389669
num_unk            0.382112
num_char           0.381596
sentiment          0.366212
num_punct          0.357476
num_discourse      0.345012
num_modals         0.314870
num_negatives      0.272517
named_entities     0.178657
num_upper          0.070443
avg_word_length   -0.060761
read_score        -0.290127
Name: tag, dtype: float64

In [117]:
X_train, y_train, y_bin_train = train.drop(['tag', 'bin_tag'], axis=1), train['tag'].copy(), train['bin_tag'].copy()
X_val, y_val, y_bin_val = val.drop(['tag', 'bin_tag'], axis=1), val['tag'].copy(), val['bin_tag'].copy()
X_test_CRC, y_test_CRC, y_bin_test_CRC = test_CRC.drop(['tag', 'bin_tag'], axis=1), test_CRC['tag'].copy(), test_CRC['bin_tag'].copy()

In [118]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2880 entries, 0 to 2879
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   text_review      2880 non-null   object 
 1   read_score       2880 non-null   float64
 2   sentiment        2880 non-null   float64
 3   num_upper        2880 non-null   int64  
 4   named_entities   2880 non-null   int64  
 5   num_positives    2880 non-null   int64  
 6   num_negatives    2880 non-null   int64  
 7   num_unk          2880 non-null   int64  
 8   num_punct        2880 non-null   int64  
 9   num_discourse    2880 non-null   int64  
 10  num_modals       2880 non-null   int64  
 11  num_tokens       2880 non-null   int64  
 12  num_char         2880 non-null   int64  
 13  avg_word_length  2880 non-null   float64
 14  text_pos         2880 non-null   object 
 15  lemmas           2880 non-null   object 
dtypes: float64(3), int64(10), object(3)
memory usage: 360.1+ KB


In [119]:
# Custom scorer for cross validation
scorer = make_scorer(f1_score, greater_is_better=True, average = 'weighted')

numeric_features = list(X_train[['read_score', 'sentiment', 'num_tokens',
                                 'num_char', 'num_upper', 'named_entities',
                                 'num_positives', 'num_negatives', 'num_unk',
                                 'num_punct', 'num_discourse', 'num_modals',
                                 'avg_word_length']])

text_features = list(X_train[['text_review', 'text_pos', 'lemmas']])

In [120]:
trans = ColumnTransformer(transformers=[
            ('review', TfidfVectorizer(min_df=1, max_df=0.1, encoding='utf-8', ngram_range=(1,2)), 'text_review'),
            ('pos', TfidfVectorizer(encoding='utf-8', ngram_range=(1,2)), 'text_pos'),
            ('lemma', TfidfVectorizer(min_df=1, max_df=0.1, encoding='utf-8', ngram_range=(1,2)), 'lemmas'),
            ('num_attr', StandardScaler(), numeric_features),
            ])

pipeline = Pipeline([
               ('union', trans),
               ('clf', RandomForestClassifier(n_estimators=100, random_state=10, 
                               n_jobs = -1))
])

In [122]:
X_train_trans = pipeline.fit_transform(X_train)

TypeError: Singleton array array(None, dtype=object) cannot be considered a valid collection.

In [112]:
cv_score = cross_val_score(pipeline, X_train, y_train, cv = 10, scoring = scorer)

In [113]:
cv_score

array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan])