# Extended Process

- Obtaining and loading data: from CSV
- Exploring the data: visualise frequencies and show distributions
- Machine learning
    - Clean data
    - Sampling
    - Split data
    - Pipeline: vectorisation and model fitting
    - Model evaluation: more detailed reporting
- Apply model (do one prediction)

### Obtaining data

In [None]:
import pandas as pd

# Read the data set from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="data/mental_health.csv")
# Preview the first rows (5 is the default for head()).
df.head(n=5)

### Exploration

In [None]:
import seaborn as sns

# Number of rows per class label.
value_counts = df["label"].value_counts()
# Horizontal bars: one bar per label, bar length is the row count.
sns.barplot(x=value_counts, y=value_counts.index, orient="h")

In [None]:
import matplotlib.pyplot as plt

# The same class-frequency chart, drawn through the pandas plotting API.
fig, ax = plt.subplots()
# reset_index() turns the row index into a column, giving count() something
# to count within each label group.
rows_per_label = df["label"].reset_index().groupby("label").count()
rows_per_label.plot.barh(legend=False, ax=ax)
# Vertical grid lines only, to make the bar lengths easier to read.
ax.grid(axis='x')
plt.show()

In [None]:
# Word count: number of whitespace-separated tokens per text.
# str() guards against non-string cells (e.g. missing values).
df["word_count"] = df["text"].apply(lambda raw_text: len(str(raw_text).split()))

# Mean word count per class, label 0 first, then label 1.
for label_value in (0, 1):
    print(df.loc[df["label"] == label_value, "word_count"].mean())

In [None]:
import matplotlib.pyplot as plt

# Word count histogram: one panel per class, side by side.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))

# (axes, label value, bar colour, panel title) for each class.
panels = (
    (ax1, 0, 'green', 'regular text'),
    (ax2, 1, 'red', 'poisonous text'),
)
for axis, label_value, colour, title in panels:
    train_words = df.loc[df['label'] == label_value, 'word_count']
    axis.hist(train_words, color=colour)
    axis.set_title(title)

fig.suptitle('Words per text')
plt.show()

### Sampling (e.g. to address class imbalance)

In [None]:
# Simple random down-sampling, e.g. to speed things up while developing.
# The sample size is capped at the number of available rows: DataFrame.sample
# raises ValueError when n exceeds the population and replace=False (default).
# The fixed random_state keeps the selection reproducible.
df = df.sample(n=min(25000, len(df)), random_state=1)

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly drop rows until the requested class ratio (sampling_strategy=1)
# is reached; the seed makes the selection reproducible.
rus = RandomUnderSampler(random_state=17, sampling_strategy=1)
# The whole frame is resampled, so the result keeps all of its columns.
# NOTE(review): df_balanced is not used by the later cells shown here; they
# continue to work on df.
df_balanced, _ = rus.fit_resample(df, df["label"])
df_balanced.head()

### Cleaning

In [None]:
import re
import nltk
# nltk.download("stopwords")
# nltk.download("wordnet")

def clean(text, stopwords):
    """Normalise one raw text for vectorisation.

    Steps: lower-case, trim, drop every character that is neither a word
    character nor whitespace, remove stopwords, lemmatise the remaining
    tokens with NLTK's WordNetLemmatizer and re-join them with single spaces.

    Args:
        text: Raw text. ``None`` and NaN (as produced for empty CSV cells)
            are treated as empty text; any other non-string value is
            converted with ``str()``.
        stopwords: Collection of words to drop. Tokens are lower-cased
            before the comparison.

    Returns:
        The cleaned text as one space-separated string, or ``""`` when no
        token is left after filtering.
    """
    # Missing values: NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower().strip()
    # NOTE: punctuation is removed before the stopword filter, so stopword
    # entries containing an apostrophe (e.g. "don't") cannot match the
    # stripped token ("dont").
    text = re.sub(r'[^\w\s]', '', text)

    # Set membership is O(1) per token; callers typically pass a list.
    if not isinstance(stopwords, (set, frozenset)):
        stopwords = set(stopwords)
    tokens = [word for word in text.split() if word not in stopwords]
    if not tokens:
        # Nothing to lemmatise; skip creating the lemmatizer.
        return ""

    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(word) for word in tokens)

# English stopword list from NLTK's "stopwords" corpus.
stopwords = nltk.corpus.stopwords.words("english")
# Clean every text; args= forwards the stopword list as clean()'s second
# positional argument.
df["text_clean"] = df["text"].apply(clean, args=(stopwords,))

df.head()

### Modelling

In [None]:
from sklearn.model_selection import train_test_split

# Features are the cleaned texts, the target is the class label.
X, y = df["text_clean"], df["label"]

# Hold out 20 % of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=17,
)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Two stages: a TF-IDF vectoriser for the texts, followed by a multinomial
# naive Bayes classifier fitted on the resulting vectors.
# NOTE(review): the second step is labelled "svm" although the estimator is
# MultinomialNB; the label is kept as-is because it is the step's name.
pipe = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("svm", MultinomialNB()),
    ]
)
pipe.fit(X_train, y_train)

### Evaluation

In [None]:
from sklearn import metrics

# Predict on the held-out texts and print the per-class metrics report.
predictions = pipe.predict(X_test)
report = metrics.classification_report(y_test, predictions)
print(report)

In [None]:
# Confusion matrix of true labels against the predictions.
confusion = metrics.confusion_matrix(y_test, predictions)

fig, ax = plt.subplots(figsize=(6, 6))
ax.matshow(confusion, alpha=0.3, cmap=plt.cm.Blues)
# Write each cell's count at its centre: column index j is the x position,
# row index i is the y position.
for i, row in enumerate(confusion):
    for j, count in enumerate(row):
        ax.text(x=j, y=i, s=count, va='center', ha='center', size='xx-large')

ax.set_xlabel('Prediction', fontsize=18)
ax.set_ylabel('Reality', fontsize=18)
ax.set_title('Confusion Matrix', fontsize=18)
plt.show()

### Application

In [None]:
# Classify one new text. The pipeline was fitted on the output of clean(),
# so the input goes through the same cleaning step; otherwise un-lemmatised
# tokens (e.g. "reasons", "keeps") cannot match the training vocabulary.
sample_text = '''
    nothing look forward lifei dont many 
    reasons keep going feel like nothing 
    keeps going next day makes want hang myself
    '''
a = pipe.predict([clean(sample_text, stopwords)])
print(a)