# Imports

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from typing import List, Union

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
import sys
# NOTE(review): machine-specific absolute Windows path; the two imports below
# presumably resolve to multilabel.py and score_sub.py in this directory, so the
# notebook will not run on another machine as-is — consider a repo-relative path.
sys.path.append(r'D:\users\trenton\Dropbox\PythonProjects\DataCamp\functions')

from multilabel import multilabel_train_test_split
from score_sub import score_submission

In [None]:
# Raise the pandas display limits so wide / long frames are shown rather than truncated
pd.options.display.max_columns = 700
pd.options.display.max_rows = 400
pd.options.display.min_rows = 10
pd.options.display.expand_frame_repr = True

# Saving data from DataCamp to a local csv

```python
# output the data to a format that can be copied and pasted into notepad++
# find all nan and replace with np.nan
# copy all of lists and replace ... in data = np.array([...])
# print the columns names df.columns.tolist() and then create a variable in a notebook
# create a dataframe with df = pd.DataFrame(data=data, columns=columns)

# create the dataframe in the DataCamp console
df = pd.read_csv('TrainingData.csv')  # do not specify index_col=0

# print each row as a list with a comma at the end and then copy all of the lists into notepad++
for r in df.to_numpy():
    print(list(r), ',')

# print the column names and copy them
df.columns.tolist()

# in jupyterlab
data = np.array([...])  # replace ... with the cleaned lists from notepad++
columns = ...  # pasted from DataCamp console

df = pd.DataFrame(data=data, columns=columns)

df.to_csv('TrainingData.csv', index=False)
```

# Exploring the Raw Data

In [None]:
# Load the training data, using the first CSV column as the index
df = pd.read_csv(
    'data/2024-01-19_school_budgeting_with_maching_learning_in_python/TrainingData.csv',
    index_col=0,
)

In [None]:
# Preview the first rows of the training data
df.head()

In [None]:
# Column names, dtypes and non-null counts
df.info()

## Summarize the Data

In [None]:
# Summary statistics; for a mixed-dtype frame describe() covers only the numeric columns by default
df.describe()

In [None]:
# Histogram of the FTE column, with missing values removed up front
fig, ax = plt.subplots()
ax.hist(df['FTE'].dropna(), ec='k')

# Title and axis labels
ax.set_title('Distribution of %full-time \n employee works')
ax.set_xlabel('% of full-time')
ax.set_ylabel('num employees')

# Render the figure
plt.show()

**The high variance in expenditures makes sense (some purchases are cheap, some are expensive). Also, it looks like the FTE column is bimodal. That is, there are some part-time and some full-time employees.**

In [None]:
# Count how many columns there are of each dtype
df.dtypes.value_counts()

## Encode the Labels as Categorical Variables

In [None]:
# Label columns that are converted to categoricals below
LABELS_td = [
    'Function',
    'Use',
    'Sharing',
    'Reporting',
    'Student_Type',
    'Position_Type',
    'Object_Type',
    'Pre_K',
    'Operating_Status',
]


def categorize_label(x):
    """Cast a column to the pandas 'category' dtype."""
    return x.astype('category')


# Convert every label column, then confirm the resulting dtypes
df[LABELS_td] = df[LABELS_td].apply(categorize_label)
print(df[LABELS_td].dtypes)

## Counting Unique Labels

In [None]:
# Number of distinct (non-missing) values in each label column, drawn as a bar chart
num_unique_labels = df[LABELS_td].nunique()
ax = num_unique_labels.plot.bar(xlabel='Column Name', ylabel='Number of Unique Categories')

## `def compute_log_loss`

In [None]:
def compute_log_loss(predicted: Union[float, List[float]], actual: Union[int, List[int]], eps: float=1e-14) -> float:
    """
    Computes the logarithmic loss between predicted and
    actual when these are scalars or 1D sequences/arrays

    :param predicted: The predicted probabilities as floats between 0-1
    :param actual: The actual binary labels. Either 0 or 1
    :param eps (optional): log(0) is -inf, so the predicted values are
                           clipped to the range [eps, 1 - eps].
    :return: The mean log loss over all (predicted, actual) pairs
    """

    # Keep probabilities strictly inside (0, 1) so neither log() call sees 0
    predicted = np.clip(predicted, eps, 1 - eps)

    # A plain Python list does not support `1 - actual`; convert so that lists,
    # scalars and arrays are all handled the same way
    actual = np.asarray(actual)

    loss = -1 * np.mean(actual * np.log(predicted)
                        + (1 - actual)
                        * np.log(1 - predicted))
    return loss

In [None]:
# (predicted probability, actual label) pairs to score one at a time
data = [(0.85, 1), (0.99, 0), (0.51, 0)]

for predicted_prob, actual_label in data:
    case_loss = compute_log_loss(predicted_prob, actual_label)
    print(case_loss)

**Lowest: A, Middle: C, Highest: B**

**Of the two incorrect predictions, B will have a higher log loss because it is confident and wrong.**

## Computing log loss with Numpy

In [None]:
# Five positive examples followed by five negative examples
actual_labels = np.repeat([1., 0.], 5)

# Predicted probabilities for the four scenarios, in the same positive-then-negative layout
correct_confident = np.repeat([0.95, 0.05], 5)
correct_not_confident = np.repeat([0.65, 0.35], 5)
wrong_not_confident = np.repeat([0.35, 0.65], 5)
wrong_confident = np.repeat([0.05, 0.95], 5)

In [None]:
# Score each prediction vector (and finally the labels themselves) against the true labels
(
    correct_confident_loss,
    correct_not_confident_loss,
    wrong_not_confident_loss,
    wrong_confident_loss,
    actual_labels_loss,
) = [
    compute_log_loss(probabilities, actual_labels)
    for probabilities in (
        correct_confident,
        correct_not_confident,
        wrong_not_confident,
        wrong_confident,
        actual_labels,
    )
]

# Report the five losses in the order they were computed
for message, loss_value in (
    ("Log loss, correct and confident: {}", correct_confident_loss),
    ("Log loss, correct and not confident: {}", correct_not_confident_loss),
    ("Log loss, wrong and not confident: {}", wrong_not_confident_loss),
    ("Log loss, wrong and confident: {}", wrong_confident_loss),
    ("Log loss, actual labels: {}", actual_labels_loss),
):
    print(message.format(loss_value))

**Log loss penalizes highly confident wrong answers much more than any other type. This will be a good metric to use on your models.**

# Creating a simple first model

## It's time to build a model

```python
data_to_train = df[NUMERIC_COLUMNS].fillna(-1000)
labels_to_use = pd.get_dummies(df[LABELS])

X_train, X_test, y_train, y_test = multilabel_train_test_split(data_to_train, labels_to_use, size=0.2, seed=123)

clf = OneVsRestClassifier(LogisticRegression())
clf.fit(X_train, y_train)
```

## Setting up a train-test split in scikit-learn

In [None]:
# Names of the numeric columns in the training frame
NUMERIC_COLUMNS_td = df.select_dtypes(include='number').columns.tolist()

# Create the new DataFrame: numeric_data_only (missing values replaced by -1000)
numeric_data_only = df[NUMERIC_COLUMNS_td].fillna(-1000)

# Get labels and convert to dummy variables: label_dummies
label_dummies = pd.get_dummies(df[LABELS_td])

# Create training and test sets
X_train, X_test, y_train, y_test = multilabel_train_test_split(numeric_data_only,
                                                               label_dummies,
                                                               size=0.2,
                                                               seed=123)

# Print the info
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" after every report.
print("X_train info:")
X_train.info()
print("\nX_test info:")
X_test.info()
print("\ny_train info:")
y_train.info()
print("\ny_test info:")
y_test.info()

## Training a model

In [None]:
# Build the one-vs-rest logistic regression and fit it to the training split: clf
clf = OneVsRestClassifier(LogisticRegression()).fit(X_train, y_train)

# Score the fitted classifier on the test split and report it
test_accuracy = clf.score(X_test, y_test)
print("Accuracy: {}".format(test_accuracy))

**The good news is that your workflow didn't cause any errors. The bad news is that your model scored the lowest possible accuracy: 0.0! But hey, you just threw away ALL of the text data in the budget. Later, you won't. Before you add the text data, let's see how the model does when scored by log loss.**

## Making predictions

```python
holdout = pd.read_csv('HoldoutData.csv', index_col=0)
holdout = holdout[NUMERIC_COLUMNS].fillna(-1000)
predictions = clf.predict_proba(holdout)
```

- Using `.predict()`
  - would result in an output of 0 or 1
  - Log loss penalizes being confident and wrong
  - Worse performance compared to `.predict_proba()`

- Format and submit predictions
```python
prediction_df = pd.DataFrame(columns=pd.get_dummies(df[LABELS], prefix_sep='__').columns, index=holdout.index, data=predictions)
prediction_df.to_csv('predictions.csv')
score = score_submission(pred_path='predictions.csv')
```

## Use your model to predict values on holdout data

In [None]:
# Fresh classifier, fitted to the training split: clf
clf = OneVsRestClassifier(LogisticRegression())
clf.fit(X_train, y_train)

# Load the holdout data, using the first CSV column as the index: holdout
holdout = pd.read_csv(
    'data/2024-01-19_school_budgeting_with_maching_learning_in_python/HoldoutData.csv',
    index_col=0,
)

# Numeric holdout columns, with missing values replaced by -1000
NUMERIC_COLUMNS_hd = holdout.select_dtypes(include='number').columns
holdout_numeric = holdout[NUMERIC_COLUMNS_hd].fillna(-1000)

# Predicted probabilities for every holdout row: predictions
predictions = clf.predict_proba(holdout_numeric)

## Writing out your results to a csv for submission

In [None]:
# Column names are derived from the training labels in df (not from holdout)
prediction_columns = pd.get_dummies(df[LABELS_td], prefix_sep='__').columns

# Format predictions in DataFrame: prediction_df
prediction_df = pd.DataFrame(data=predictions,
                             index=holdout.index,
                             columns=prediction_columns)

# Save prediction_df to csv
predictions_path = 'data/2024-01-19_school_budgeting_with_maching_learning_in_python/predictions.csv'
prediction_df.to_csv(predictions_path)

In [None]:
# requires functions and variables from https://goodboychan.github.io/python/datacamp/machine_learning/2020/06/05/01-School-Budgeting-with-Machine-Learning-in-Python.html
# BOX_PLOTS_COLUMN_INDICES, def _multi_multi_log_loss, and def score_submission

# Submit the predictions for scoring: score
# (compares the predictions csv written above against the labels file passed as holdout_path)
score = score_submission(pred_path='data/2024-01-19_school_budgeting_with_maching_learning_in_python/predictions.csv', holdout_path='data/2024-01-19_school_budgeting_with_maching_learning_in_python/TestSetLabelsSample.csv')

# Print score
print('Your model, trained with numeric data only, yields logloss score: {}'.format(score))

**Even though your basic model scored 0.0 accuracy, it nevertheless performs better than the benchmark score of 2.0455. You've now got the basics down and have made a first pass at this complicated supervised learning problem. It's time to step up your game and incorporate the text data.**

## A very brief introduction to NLP

- Data for NLP:
  - Text, documents, speech, ...
- Tokenization
  - Splitting a string into segments
  - Store segments as list
- Example: "Natural Language Processing"
  - ['Natural', 'Language', 'Processing']

- Tokenize on whitespace
  - Petro-vend fuel and fluids
    - Petro-vend | fuel | and | fluids
- Tokenize on whitespace and punctuation
  - Petro | vend | fuel | and | fluids
 
- Bag of words representation
  - Count the number of times a particular token appears
  - 'Bag of words'
    - Count the number of times a word was pulled out of the bag
- The approach discards information about word order
  - 'Red, not blue' is the same as 'blue, not red'
 
- 1-gram, 2-gram, ..., n-gram
  - ![][1]
 
[1]: https://raw.githubusercontent.com/trenton3983/DataCamp/master/Images/2024-01-19_school_budgeting_with_maching_learning_in_python/sb01.jpg

## Tokenizing text

- 6

## Testing your NLP credentials with n-grams

- 12 - The number of `1-grams + 2-grams + 3-grams` is `5 + 4 + 3 = 12`.

## Representing text numerically

**Scikit-learn tools for bag-of-words**

- `CountVectorizer()`
  - Tokenizes all the strings
  - Builds a 'vocabulary'
  - Counts the occurrences of each token in the vocabulary
 
**Using `CountVectorizer()` on a column of main dataset**

```python
from sklearn.feature_extraction.text import CountVectorizer

TOKENS_BASIC = '\\S+(?=\\s+)'
df.Program_Description.fillna('', inplace=True)
vec_basic = CountVectorizer(token_pattern=TOKENS_BASIC)

vec_basic.fit(df.Program_Description)
msg = 'There are {} tokens in Program_Description if tokens are any non-whitespace'
print(msg.format(len(vec_basic.get_feature_names_out())))
```


## Creating a bag-of-words in scikit-learn

In this exercise, you'll study the effects of tokenizing in different ways by comparing the bag-of-words representations resulting from different token patterns.

You will focus on one feature only, the `Position_Extra` column, which describes any additional information not captured by the `Position_Type` label.

For example, in the Shell you can check out the budget item in row 8960 of the data using `df.loc[8960]`. Looking at the output reveals that this `Object_Description` is overtime pay. For who? The Position Type is merely "other", but the Position Extra elaborates: "BUS DRIVER". Explore the column further to see more instances. It has a lot of NaN values.

Your task is to turn the raw text in this column into a bag-of-words representation by creating tokens that contain only alphanumeric characters.

For comparison purposes, the first 15 tokens of `vec_basic`, which splits `df.Position_Extra` into tokens when it encounters only whitespace characters, have been printed along with the length of the representation.

**Instructions**

- Import `CountVectorizer` from `sklearn.feature_extraction.text`.
- Fill missing values in `df.Position_Extra` using `.fillna('')` to replace NaNs with empty strings. Specify the additional keyword argument `inplace=True` so that you don't have to assign the result back to `df`.
- Instantiate the `CountVectorizer` as `vec_alphanumeric` by specifying the `token_pattern` to be `TOKENS_ALPHANUMERIC`.
- Fit `vec_alphanumeric` to `df.Position_Extra`.
- Hit submit to see the `len` of the fitted representation as well as the first 15 elements, and compare to `vec_basic`.

In [None]:
# Create the token pattern: TOKENS_ALPHANUMERIC
# (runs of letters/digits that are followed by whitespace)
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

# Fill missing values in df.Position_Extra
# The result is assigned back instead of calling fillna(..., inplace=True) on
# df.Position_Extra: that is a chained inplace call, which pandas warns about and
# which no longer updates df once Copy-on-Write is active, leaving NaNs that
# CountVectorizer cannot tokenize.
df['Position_Extra'] = df['Position_Extra'].fillna('')

# Instantiate the CountVectorizer: vec_alphanumeric
vec_alphanumeric = CountVectorizer(token_pattern=TOKENS_ALPHANUMERIC)

# Fit to the data
vec_alphanumeric.fit(df['Position_Extra'])

# Print the number of tokens and first 15 tokens
msg = "There are {} tokens in Position_Extra if we split on non-alpha numeric"
print(msg.format(len(vec_alphanumeric.get_feature_names_out())))
print(vec_alphanumeric.get_feature_names_out()[:15])

**Treating only alpha-numeric characters as tokens gives you a smaller number of more meaningful tokens.**

## Combining text columns for tokenization

In order to get a bag-of-words representation for all the text data in our DataFrame, you must first convert the text data in each row of the DataFrame into a single string.

In the previous exercise, this wasn't necessary because you only looked at one column of data, so each row was already just a single string. `CountVectorizer` expects each row to just be a single string, so in order to use all of the text columns, you'll need a method to turn a list of strings into a single string.

In this exercise, you'll complete the function definition `combine_text_columns()`. When completed, this function will convert all training text data in your DataFrame to a single string per row that can be passed to the vectorizer object and made into a bag-of-words using the `.fit_transform()` method.

Note that the function uses `NUMERIC_COLUMNS` and `LABELS` to determine which columns to drop. These lists have been loaded into the workspace.

**Instructions**

- Use the `.drop()` method on `data_frame` with `to_drop` and `axis=` as arguments to drop the non-text data. Save the result as `text_data`.
- Fill in missing values (inplace) in `text_data` with blanks (""), using the `.fillna()` method.
- Complete the `.apply()` method by writing a lambda function that uses the `.join()` method to join all the items in a row with a space in between.

In [None]:
# Define combine_text_columns()
def combine_text_columns(data_frame: pd.DataFrame, to_drop: list=NUMERIC_COLUMNS_td + LABELS_td) -> pd.Series:
    """
    Converts all text in each row of data_frame to a single string.

    :param data_frame: Frame holding the text columns; it is not modified.
    :param to_drop: Names of the non-text columns to leave out. Names that
                    are not columns of data_frame are ignored.
    :return: One string per row: the remaining values of that row joined
             with single spaces, missing values contributing an empty string.
    """

    # Drop non-text columns that are in the df
    drop_names = set(to_drop)
    present_to_drop = [col for col in data_frame.columns if col in drop_names]
    text_data = data_frame.drop(columns=present_to_drop)

    # Replace nans with blanks, then cast to str so that a leftover non-string
    # value (e.g. a number in a column not listed in to_drop) cannot break the join
    text_data = text_data.fillna('').astype(str)

    # Join all text items in a row with a space in between
    return text_data.apply(lambda row: " ".join(row), axis=1)

## What's in a token?

Now you will use `combine_text_columns` to convert all training text data in your DataFrame to a single vector that can be passed to the vectorizer object and made into a bag-of-words using the `.fit_transform()` method.

You'll compare the effect of tokenizing using any non-whitespace characters as a token and using only alphanumeric characters as a token.

**Instructions**

- Import `CountVectorizer` from `sklearn.feature_extraction.text`.
- Instantiate `vec_basic` and `vec_alphanumeric` using, respectively, the `TOKENS_BASIC` and `TOKENS_ALPHANUMERIC` patterns.
- Create the text vector by using the `combine_text_columns()` function on `df`.
- Using the `.fit_transform()` method with `text_vector`, fit and transform first `vec_basic` and then `vec_alphanumeric`. Print the number of tokens they contain.

In [None]:
# Token patterns: any run of non-whitespace vs. runs of letters/digits,
# each required to be followed by whitespace
TOKENS_BASIC = '\\S+(?=\\s+)'
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

# One vectorizer per pattern: vec_basic, vec_alphanumeric
vec_basic = CountVectorizer(token_pattern=TOKENS_BASIC)
vec_alphanumeric = CountVectorizer(token_pattern=TOKENS_ALPHANUMERIC)

# Create the text vector (one combined string per row of df)
text_vector = combine_text_columns(df)

# Fit and transform vec_basic, then report the size of its vocabulary
vec_basic.fit_transform(text_vector)
n_basic_tokens = len(vec_basic.get_feature_names_out())
print("There are {} tokens in the dataset".format(n_basic_tokens))

# Fit and transform vec_alphanumeric, then report the size of its vocabulary
vec_alphanumeric.fit_transform(text_vector)
n_alphanumeric_tokens = len(vec_alphanumeric.get_feature_names_out())
print("There are {} alpha-numeric tokens in the dataset".format(n_alphanumeric_tokens))

**Notice that tokenizing on alpha-numeric tokens reduced the number of tokens, just as in the last exercise. We'll keep this in mind when building a better model with the Pipeline object next.**