# Warsztaty badawcze 2
## Projekt
### Andżelika Zalewska
12.05.2020

Cel projektu: Zbudowanie modelu predykcyjnego z wykorzystaniem poznanych technik text miningu w oparciu o zbiór treningowy złożony z 7 zmiennych objaśniających. Zbiór zawiera 10268 obserwacji.

In [65]:
import numpy as np 
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.decomposition import TruncatedSVD, NMF, LatentDirichletAllocation
import spacy
from sklearn.preprocessing import FunctionTransformer

In [2]:
# Load the tab-separated training set; the first row holds the column names.
D = pd.read_csv("train.tsv", sep="\t", header = 0)
print(D.head())
print(D.columns)
# Binary target: 1 where the label equals "pants-fire", 0 for every other label.
y = 1*(D.label == "pants-fire").values
# Replace missing values with empty strings before feature extraction.
X = D.fillna("")

         label                                          statement  \
0    half-true  When did the decline of coal start? It started...   
1  mostly-true  Hillary Clinton agrees with John McCain "by vo...   
2        false  Health care reform legislation is likely to ma...   
3    half-true  The economic turnaround started at the end of ...   
4         true  The Chicago Bears have had more starting quart...   

                              subject         speaker  \
0  energy,history,job-accomplishments  scott-surovell   
1                      foreign-policy    barack-obama   
2                         health-care    blog-posting   
3                        economy,jobs   charlie-crist   
4                           education       robin-vos   

                  speaker_job      state       party  \
0              State delegate   Virginia    democrat   
1                   President   Illinois    democrat   
2                         NaN        NaN        none   
3                 

In [3]:
# Load the tab-separated test set (file name suggests it has no labels — confirm).
d = pd.read_csv("test_noy.tsv", sep="\t", header = 0)
# Replace missing values with empty strings, as done for the training data.
X_test = d.fillna("")

In [29]:
!python -m spacy download en_core_web_lg

[+] Download and installation successful
You can now load the model via spacy.load('en_core_web_lg')


In [7]:
!python -m spacy download en_core_web_md

[+] Download and installation successful
You can now load the model via spacy.load('en_core_web_md')


In [4]:
# Medium English spaCy model; text_to_vec uses it to compute document vectors.
nlp2 = spacy.load("en_core_web_md")

In [5]:
# Large English spaCy model; used to parse statements into the "statement_spacy" column.
nlp = spacy.load('en_core_web_lg')

In [6]:
# Run the `nlp` pipeline on every training statement and keep the result per row.
X["statement_spacy"] = X.statement.apply(nlp)

In [66]:
# Run the `nlp` pipeline on every test statement and keep the result per row.
X_test["statement_spacy"] = X_test.statement.apply(nlp)

In [8]:
# Lemmatization
def lemmatize(x):
    """Return one space-joined string of token lemmas per document in ``x``.

    ``x`` is an iterable of documents. Each document is iterated token by
    token and every token's ``lemma_`` attribute is read.
    """
    return [" ".join(token.lemma_ for token in doc) for doc in x]
# Wrap lemmatize so it can be used as a step in a scikit-learn pipeline.
Lemmatizer = FunctionTransformer(lemmatize)

In [9]:
def text_to_vec(X):
    """Turn each text in ``X`` into the vector of its ``nlp2`` document.

    ``X`` must provide ``.apply`` (e.g. a pandas Series). Every element is
    passed to the module-level ``nlp2`` callable with the parser, tagger and
    ner components disabled. The ``vector`` attribute of each resulting
    document is collected into a single NumPy array.
    """
    disabled_components = ["parser", "tagger", "ner"]
    docs = X.apply(nlp2, disable =disabled_components)
    vectors = [doc.vector for doc in docs]
    return np.array(vectors)

# Wrap text_to_vec so it can be used as a transformer in the ColumnTransformer.
doc2vec_transf = FunctionTransformer(text_to_vec)

In [10]:
def extract_text_features(s):
    """Compute simple length statistics for every text in ``s``.

    Parameters
    ----------
    s : pandas.Series
        Texts; values are converted to ``str`` first.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(s), 3)`` with the columns: number of
        characters, number of whitespace-delimited words, and characters
        per word. Rows with no words get 0.0 in the last column.
    """
    s = s.astype('str')
    n = np.asarray(s.str.len().values, dtype=float)
    n_w = np.asarray(s.str.split().str.len().values, dtype=float)
    # An empty or whitespace-only text has zero words; plain division would
    # put NaN/inf into the feature matrix, which downstream estimators reject.
    avg_w_len = np.divide(n, n_w, out=np.zeros_like(n), where=n_w > 0)
    return np.column_stack([n, n_w, avg_w_len])

In [11]:
def ner(x):
    """Return one space-joined string of entity labels per document in ``x``.

    For every document, the ``label_`` of each item in its ``ents`` attribute
    is read, in order. A document without entities yields an empty string.
    """
    return [" ".join(entity.label_ for entity in doc.ents) for doc in x]
# Wrap ner so it can be used as a step in a scikit-learn pipeline.
NamedEntityRecognition= FunctionTransformer(ner)   

In [12]:
def pos(x):
    """Return one space-joined string of ``pos_`` tags per document in ``x``.

    Each document is iterated token by token and every token's ``pos_``
    attribute is read.
    """
    return [" ".join(token.pos_ for token in doc) for doc in x]
# Wrap pos so it can be used as a step in a scikit-learn pipeline.
PartOfSpeechTagging = FunctionTransformer(pos)

In [13]:
def posdet(x):
    """Return one space-joined string of ``tag_`` values per document in ``x``.

    Each document is iterated token by token and every token's ``tag_``
    attribute is read.
    """
    return [" ".join(token.tag_ for token in doc) for doc in x]
# Wrap posdet so it can be used as a step in a scikit-learn pipeline.
PartOfSpeechTaggingDetailed = FunctionTransformer(posdet)

In [14]:
def avg_length(x):
    """Return the mean word length, in characters, of each string in ``x``.

    Words are whitespace-delimited. A document with no words (empty or
    whitespace-only) yields 0.0 instead of raising ``ZeroDivisionError``.

    Parameters
    ----------
    x : iterable of str
        Documents to measure.

    Returns
    -------
    list of float
        One average per document, in input order.
    """
    averages = []
    for text in x:
        words = text.split()
        if not words:
            # No words to average over; avoid dividing by zero.
            averages.append(0.0)
            continue
        averages.append(sum(len(word) for word in words) / len(words))
    return averages

In [49]:
# average word length in each training document
X["average"] = avg_length(X.statement)

In [50]:
# average word length in each test document
X_test["average"] = avg_length(X_test.statement)

In [15]:
# number of words in each document
def number_words(x):
    """Return the whitespace-delimited word count of each string in ``x``."""
    return [len(text.split()) for text in x]

In [51]:
# number of words in each training document
X["number"] = number_words(X.statement)

In [52]:
# number of words in each test document
X_test["number"] = number_words(X_test.statement)

In [58]:
# Feature assembly: each entry is (name, transformer, column(s)).
#  - "statement_spacy": TF-IDF over the strings produced by Lemmatizer,
#    PartOfSpeechTagging, PartOfSpeechTaggingDetailed and NamedEntityRecognition.
#  - "statement": doc2vec_transf vectors, three TF-IDF + TruncatedSVD projections
#    (1000 / 500 / 100 components) and the extract_text_features statistics.
#  - "party", "context", "speaker", "subject": TF-IDF; "speaker_job": HashingVectorizer.
#  - "average", "number": precomputed numeric columns passed through unchanged.
# NOTE(review): stop_words="english" is also applied to the tag/label strings;
# tags that lowercase to an English stop word (presumably e.g. "IN", "TO")
# would be dropped — confirm this is intended.
ct = ColumnTransformer([("statement", make_pipeline(Lemmatizer,TfidfVectorizer(stop_words = "english")), "statement_spacy"),
                        ("statementPOS", make_pipeline(PartOfSpeechTagging,TfidfVectorizer(stop_words = "english")), "statement_spacy"),
                        ("statementPOSDetailed", make_pipeline(PartOfSpeechTaggingDetailed,TfidfVectorizer(stop_words = "english")), "statement_spacy"),
                        ("statementNER", make_pipeline(NamedEntityRecognition,TfidfVectorizer(stop_words = "english")), "statement_spacy"),
                        ("statement2vec", doc2vec_transf, "statement"),
                        ("statement_svd1", Pipeline([("TFIDF", TfidfVectorizer()), ("svd", TruncatedSVD(n_components = 1000, n_iter=10))]), "statement"),
                        ("statement_svd2", Pipeline([("TFIDF", TfidfVectorizer()), ("svd", TruncatedSVD(n_components = 500, n_iter=10))]), "statement"),
                        ("statement_svd3", Pipeline([("TFIDF", TfidfVectorizer()), ("svd", TruncatedSVD(n_components = 100, n_iter=10))]), "statement"),
                        ("funcTrans", FunctionTransformer(func=extract_text_features, validate = False, accept_sparse = True), "statement"), 
                        ("party", TfidfVectorizer(), "party"),
                        ("context", TfidfVectorizer(ngram_range =(1,2)), "context"),
                        ("speaker_job", HashingVectorizer(), "speaker_job"),
                        ("speaker", TfidfVectorizer(ngram_range =(1,2)), "speaker"), 
                        ("subject", TfidfVectorizer(), "subject"),
                        ("avg_length", "passthrough" , ["average"]),
                        ("number_words", "passthrough" , ["number"])
                       ])

In [59]:
# model selected with GridSearchCV
model = LogisticRegression(solver='newton-cg', penalty= "l2", C = 0.5, n_jobs=-1)
# Full pipeline: feature extraction (ct) followed by the classifier.
p= Pipeline([("ct", ct),("l",model)])

####  Jakość dopasowania modelu

In [62]:
# Cross-validated ROC AUC of the full pipeline on the training data.
score = cross_val_score(p,X,y,scoring="roc_auc")
score.mean()
# result on the training set
# 0.7634088419984134

In [64]:
# fit the model on the full training data
p.fit(X,y)

## Scory na zbiorze testowym

In [56]:
# Decision-function values of the fitted pipeline for the test set.
# NOTE: this rebinds `score`, which earlier held the cross-validation scores.
score = p.decision_function(X_test)

In [63]:
score

array([-3.62755098, -2.02704756, -1.46937001, ..., -1.19378622,
       -2.16793641, -3.44960607])

In [121]:
# Predicted class probabilities for the test set; despite the name, y_test
# holds the predict_proba output, not labels.
y_test = p.predict_proba(X_test)

In [204]:
# Second column of the predict_proba output — presumably the probability of
# class 1 ("pants-fire"); confirm against p.classes_.
probab_test = y_test[:,1]

In [184]:
probab_test 

array([0.02660189, 0.12873048, 0.16606767, ..., 0.19926521, 0.09286273,
       0.04526488])

In [206]:
# NOTE(review): `score_test` is not defined in any cell visible in this file
# (the visible candidates are `score` and `probab_test`); running the notebook
# top to bottom would presumably raise NameError here — confirm the intended source.
lst = list(score_test)

In [202]:
# Convert the list of values to a NumPy array before writing it out.
lst = np.array(lst)

In [208]:
# Write the values in `lst` to a text file, one value per line.
with open('score_testowy_textmining', 'w') as out_file:
    out_file.writelines("%s\n" % value for value in lst)