# Скачаем датасет Twitter Sentiment

In [10]:
# Smoke-test that shell escapes work from this notebook.
! echo "Hello, notebooks! :)"
# Create the working directories; -p makes both commands safe to re-run.
! mkdir -p /home/jovyan/data
! mkdir -p /home/jovyan/models

Hello, notebooks! :)


In [None]:
# Download the training/test archive and store it as sentiment.zip in the data directory.
! wget http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip -O /home/jovyan/data/sentiment.zip

In [None]:
! cd /home/jovyan/data && unzip sentiment.zip

In [None]:
# List the data directory to check what the previous cells left there.
! ls -la /home/jovyan/data

In [None]:
# Show the first five raw lines of the training CSV to inspect its layout.
! head -n 5 /home/jovyan/data/training.1600000.processed.noemoticon.csv

# Читаем датасет с помощью Spark

In [None]:
from pyspark.sql import SparkSession

# Get the existing session or create a new one for this application name.
spark = (
    SparkSession.builder
    .appName('twitter-sentiment')
    .getOrCreate()
)

print("Spark context started")

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, LongType, StringType

# Explicit schema for the header-less CSV, listed in file column order as
# (name, Spark type). Every column is declared nullable.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ("target", IntegerType()),
        ("id", LongType()),
        ("raw_timestamp", StringType()),
        ("query_status", StringType()),
        ("author", StringType()),
        ("tweet", StringType()),
    ]
])

data_path = "/home/jovyan/data/training.1600000.processed.noemoticon.csv"

# Keep only the tweet text and a binary label: target == 4 becomes 1,
# every other value becomes 0.
# NOTE(review): the read relies on Spark's default CSV encoding - confirm it
# matches the encoding of the downloaded file.
raw_sentiment = (
    spark.read
    .csv(data_path, schema=schema, header=False)
    .selectExpr("(case when target=4 then 1 else 0 end) as target", "tweet")
)

# Row count per remapped label.
raw_sentiment.groupBy("target").count().show()

In [None]:
import nltk
# Download the NLTK 'stopwords' and 'punkt' resources.
# NOTE(review): presumably required by the sentiment_model pipeline imported
# later in this notebook - confirm.
nltk.download('stopwords')
nltk.download('punkt')

In [None]:
# Draw a seeded 0.5% sample (without replacement) from the Spark DataFrame
# and collect it into a pandas DataFrame on the driver.
raw_sentiment_sample = (
    raw_sentiment
    .sample(withReplacement=False, fraction=0.005, seed=42)
    .toPandas()
)

# Features are the tweet texts; labels are the binary targets.
X = raw_sentiment_sample["tweet"]
y = raw_sentiment_sample["target"]

print("Dataset size is: %i" % X.size)


In [None]:
from sklearn.model_selection import GridSearchCV
from time import time
from pprint import pprint

from sentiment_model import pipeline, parameters

if __name__ == "__main__":
    # The search runs with n_jobs=-1 (parallel workers), so it is kept inside
    # a __main__-guarded block.

    # Search the `parameters` grid over the whole pipeline (feature
    # extraction and classifier together) with 3-fold cross-validation.
    grid_search = GridSearchCV(
        pipeline,
        parameters,
        cv=3,
        n_jobs=-1,
        verbose=1,
    )

    print("Performing grid search...")
    print("pipeline:", [step_name for step_name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)

    # Time the fit on the sampled tweets.
    t0 = time()
    grid_search.fit(X, y)
    elapsed = time() - t0
    print("done in %0.3fs" % elapsed)
    print()

    # Report the best score and the winning value of every tuned parameter,
    # in alphabetical order of parameter name.
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

In [None]:
import pickle as pkl

def save_model(model, model_path):
    """Pickle ``model`` to the file at ``model_path``.

    Args:
        model: Any picklable object (here, a fitted estimator).
        model_path: Destination file path; it is opened in binary write
            mode, so an existing file is overwritten.
    """
    with open(model_path, 'wb') as sink:
        pkl.dump(model, sink)

def read_model(model_path):
    """Load and return the object pickled in the file at ``model_path``.

    Args:
        model_path: Path of a pickle file, opened in binary read mode.

    Returns:
        The unpickled object.

    Note:
        Unpickling can execute arbitrary code; only use this on files you
        produced yourself or otherwise trust.
    """
    with open(model_path, 'rb') as source:
        restored = pkl.load(source)
    return restored

# Pickle the best estimator found by the grid search to model_path.
# NOTE(review): the first cell creates /home/jovyan/models, but the model is
# written directly under /home/jovyan - confirm which location is intended.
model_path = "/home/jovyan/tweet_sentiment.mdl"
save_model(model=grid_search.best_estimator_, model_path=model_path)

In [None]:
# Load the pickled model back from model_path.
model_object = read_model(model_path)
# Bare expression: the notebook renders the loaded object as the cell output.
model_object

In [None]:
%matplotlib inline
import pandas as pd

pd.Series(model_object.predict_proba(X)[:,1]).hist(figsize=(20,10))

In [None]:
# Stop the Spark session created earlier in the notebook.
spark.stop()