Replace unification-fd with a custom implementation of unification (#…

…1802) Closes #1661; towards #154. `unification-fd` is very powerful, and extremely fast, but it was written a long time ago and its age shows. It was not possible to incorporate it into our effects system in a nice way, necessitating the use of concrete monad transformers in the typechecking code. In addition it is impossible to customize, and we have been contemplating new type system features such as #153 and #154 that turn out to require hooking into the way the unification algorithm works (see #154 (comment) for more details). This PR thus removes the dependency on `unification-fd` and implements our own version of unification. It is not quite as fast as `unification-fd` but I consider the slowdown acceptable in order to gain e.g. recursive types. And of course there is also room to optimize it. The custom `UTerm` from `unification-fd` is replaced with the standard `Free` (free monad) construction from the `free` package, and the custom `Fix` from `unification-fd` is replaced with the one from `data-fix`. We also get rid of the `unifyCheck` function, which used to be a quick short-circuiting way to check whether two types definitely did not unify or might unify, allowing us to give better error messages more quickly. Now, the `=:=` unification operator itself just does this.
swarm-game · Apr 27, 2024 · 917ee5c · 917ee5c
1 parent d749c5e
commit 917ee5c
Show file tree

Hide file tree

Showing 12 changed files with 869 additions and 359 deletions.
diff --git a/.github/workflows/normalize-cabal.yml b/.github/workflows/normalize-cabal.yml
@@ -18,7 +18,7 @@ jobs:
     runs-on: ubuntu-22.04
     steps:
       - uses: actions/checkout@v4
-      - uses: tfausak/cabal-gild-setup-action@v1
+      - uses: tfausak/cabal-gild-setup-action@v2
         with:
-          version: 0.3.0.1
+          version: 1.3.0.1
       - run: cabal-gild --input swarm.cabal --mode check
diff --git a/src/swarm-lang/Swarm/Effect/Unify.hs b/src/swarm-lang/Swarm/Effect/Unify.hs
@@ -0,0 +1,53 @@
+{-# LANGUAGE GADTs #-}
+{-# LANGUAGE KindSignatures #-}
+
+-- |
+-- SPDX-License-Identifier: BSD-3-Clause
+-- Description: This module defines an effect signature for
+-- computations that support doing unification.  The intention is for
+-- code needing unification to use the operations defined in this
+-- module, and then import 'Swarm.Effect.Unify.Fast' to dispatch the
+-- 'Unification' effects.
+module Swarm.Effect.Unify where
+
+import Control.Algebra
+import Data.Kind (Type)
+import Data.Set (Set)
+import Swarm.Language.Types hiding (Type)
+
+-- | Data type representing available unification operations.
+data Unification (m :: Type -> Type) k where
+  Unify :: UType -> UType -> Unification m (Either UnificationError UType)
+  ApplyBindings :: UType -> Unification m UType
+  FreshIntVar :: Unification m IntVar
+  FreeUVars :: UType -> Unification m (Set IntVar)
+
+-- | Unify two types, returning a type equal to both, or a 'UnificationError' if
+--   the types definitely do not unify.
+(=:=) :: Has Unification sig m => UType -> UType -> m (Either UnificationError UType)
+t1 =:= t2 = send (Unify t1 t2)
+
+-- | Substitute for all the unification variables that are currently
+--   bound.  It is guaranteed that any unification variables remaining
+--   in the result are not currently bound, /i.e./ we have learned no
+--   information about them.
+applyBindings :: Has Unification sig m => UType -> m UType
+applyBindings = send . ApplyBindings
+
+-- | Compute the set of free unification variables of a type (after
+--   substituting away any which are already bound).
+freeUVars :: Has Unification sig m => UType -> m (Set IntVar)
+freeUVars = send . FreeUVars
+
+-- | Generate a fresh unification variable.
+freshIntVar :: Has Unification sig m => m IntVar
+freshIntVar = send FreshIntVar
+
+-- | An error that occurred while running the unifier.
+data UnificationError where
+  -- | Occurs check failure, i.e. the solution to some unification
+  --   equations was an infinite term.
+  Infinite :: IntVar -> UType -> UnificationError
+  -- | Mismatch error between the given terms.
+  UnifyErr :: TypeF UType -> TypeF UType -> UnificationError
+  deriving (Show)
diff --git a/src/swarm-lang/Swarm/Effect/Unify/Common.hs b/src/swarm-lang/Swarm/Effect/Unify/Common.hs
@@ -0,0 +1,60 @@
+{-# LANGUAGE DerivingStrategies #-}
+{-# LANGUAGE GeneralisedNewtypeDeriving #-}
+
+-- |
+-- SPDX-License-Identifier: BSD-3-Clause
+--
+-- Description: Common definitions used in both the naive and fast
+-- implementations of unification.
+module Swarm.Effect.Unify.Common where
+
+import Control.Algebra
+import Control.Effect.State (State, get)
+import Data.Map (Map)
+import Data.Map qualified as M
+import Data.Set (Set)
+import Prelude hiding (lookup)
+
+------------------------------------------------------------
+-- Substitutions
+
+-- | A value of type @Subst n a@ is a substitution which maps
+--   names of type @n@ (the /domain/, see 'dom') to values of type
+--   @a@.  Substitutions can be /applied/ to certain terms (see
+--   'subst'), replacing any free occurrences of names in the
+--   domain with their corresponding values.  Thus, substitutions can
+--   be thought of as functions of type @Term -> Term@ (for suitable
+--   @Term@s that contain names and values of the right type).
+--
+--   Concretely, substitutions are stored using a @Map@.
+newtype Subst n a = Subst {getSubst :: Map n a}
+  deriving (Eq, Ord, Show, Functor)
+
+-- | The domain of a substitution is the set of names for which the
+--   substitution is defined.
+dom :: Subst n a -> Set n
+dom = M.keysSet . getSubst
+
+-- | The identity substitution, /i.e./ the unique substitution with an
+--   empty domain, which acts as the identity function on terms.
+idS :: Subst n a
+idS = Subst M.empty
+
+-- | Construct a singleton substitution, which maps the given name to
+--   the given value.
+(|->) :: n -> a -> Subst n a
+x |-> t = Subst (M.singleton x t)
+
+-- | Insert a new name/value binding into the substitution.
+insert :: Ord n => n -> a -> Subst n a -> Subst n a
+insert n a (Subst m) = Subst (M.insert n a m)
+
+-- | Look up the value a particular name maps to under the given
+--   substitution; or return @Nothing@ if the name being looked up is
+--   not in the domain.
+lookup :: Ord n => n -> Subst n a -> Maybe a
+lookup x (Subst m) = M.lookup x m
+
+-- | Look up a name in a substitution stored in a state effect.
+lookupS :: (Ord n, Has (State (Subst n a)) sig m) => n -> m (Maybe a)
+lookupS x = lookup x <$> get
diff --git a/src/swarm-lang/Swarm/Effect/Unify/Fast.hs b/src/swarm-lang/Swarm/Effect/Unify/Fast.hs
@@ -0,0 +1,238 @@
+{-# LANGUAGE DerivingStrategies #-}
+{-# LANGUAGE GADTs #-}
+{-# LANGUAGE GeneralizedNewtypeDeriving #-}
+{-# LANGUAGE UndecidableInstances #-}
+
+-- |
+-- SPDX-License-Identifier: BSD-3-Clause
+--
+-- Description: Fast yet purely functional implementation of
+-- unification, using a map as a lazy substitution, i.e. a
+-- manually-maintained "functional shared memory".
+--
+-- See Dijkstra, Middelkoop, & Swierstra, "Efficient Functional
+-- Unification and Substitution", Utrecht University tech report
+-- UU-CS-2008-027 (section 5) for the basic idea, and Peyton Jones et
+-- al, "Practical type inference for arbitrary-rank types"
+-- (pp. 74--75) for a correct implementation of unification via
+-- references.
+module Swarm.Effect.Unify.Fast where
+
+import Control.Algebra
+import Control.Applicative (Alternative)
+import Control.Carrier.State.Strict (StateC, evalState)
+import Control.Carrier.Throw.Either (ThrowC, runThrow)
+import Control.Category ((>>>))
+import Control.Effect.State (State, get, gets, modify)
+import Control.Effect.Throw (Throw, throwError)
+import Control.Monad.Free
+import Control.Monad.Trans (MonadIO)
+import Data.Function (on)
+import Data.Map qualified as M
+import Data.Map.Merge.Lazy qualified as M
+import Data.Set qualified as S
+import Swarm.Effect.Unify
+import Swarm.Effect.Unify.Common
+import Swarm.Language.Types hiding (Type)
+import Prelude hiding (lookup)
+
+------------------------------------------------------------
+-- Substitutions
+
+-- | Compose two substitutions.  Applying @s1 \@\@ s2@ is the same as
+--   applying first @s2@, then @s1@; that is, semantically,
+--   composition of substitutions corresponds exactly to function
+--   composition when they are considered as functions on terms.
+--
+--   As one would expect, composition is associative and has 'idS' as
+--   its identity.
+--
+--   Note that we do /not/ apply @s1@ to all the values in @s2@, since
+--   the substitution is maintained lazily; we do not need to maintain
+--   the invariant that values in the mapping do not contain any of
+--   the keys.  This makes composition much faster, at the cost of
+--   making application more complex.
+(@@) :: (Ord n, Substitutes n a a) => Subst n a -> Subst n a -> Subst n a
+(Subst s1) @@ (Subst s2) = Subst (s2 `M.union` s1)
+
+-- | Class of things supporting substitution.  @Substitutes n b a@ means
+--   that we can apply a substitution of type @Subst n b@ to a
+--   value of type @a@, replacing all the free names of type @n@
+--   inside the @a@ with values of type @b@, resulting in a new value
+--   of type @a@.
+--
+--   We also do a lazy occurs-check during substitution application,
+--   so we need the ability to throw a unification error.
+class Substitutes n b a where
+  subst :: Has (Throw UnificationError) sig m => Subst n b -> a -> m a
+
+-- | We can perform substitution on terms built up as the free monad
+--   over a structure functor @f@.
+instance Substitutes IntVar UType UType where
+  subst s = go S.empty
+   where
+    go seen (Pure x) = case lookup x s of
+      Nothing -> pure $ Pure x
+      Just t
+        | S.member x seen -> throwError $ Infinite x t
+        | otherwise -> go (S.insert x seen) t
+    go seen (Free t) = Free <$> goF seen t
+
+    goF _ t@(TyBaseF {}) = pure t
+    goF _ t@(TyVarF {}) = pure t
+    goF seen (TySumF t1 t2) = TySumF <$> go seen t1 <*> go seen t2
+    goF seen (TyProdF t1 t2) = TyProdF <$> go seen t1 <*> go seen t2
+    goF seen (TyRcdF m) = TyRcdF <$> mapM (go seen) m
+    goF seen (TyCmdF c) = TyCmdF <$> go seen c
+    goF seen (TyDelayF c) = TyDelayF <$> go seen c
+    goF seen (TyFunF t1 t2) = TyFunF <$> go seen t1 <*> go seen t2
+
+------------------------------------------------------------
+-- Carrier type
+
+-- Note: this carrier type and the runUnification function are
+-- identical between this module and Swarm.Effect.Unify.Naive, but it
+-- seemed best to duplicate it, so we can modify the carriers
+-- independently in the future if we want.
+
+-- | Carrier type for unification: we maintain a current substitution,
+--   a counter for generating fresh unification variables, and can
+--   throw unification errors.
+newtype UnificationC m a = UnificationC
+  { unUnificationC ::
+      StateC (Subst IntVar UType) (StateC FreshVarCounter (ThrowC UnificationError m)) a
+  }
+  deriving newtype (Functor, Applicative, Alternative, Monad, MonadIO)
+
+-- | Counter for generating fresh unification variables.
+newtype FreshVarCounter = FreshVarCounter {getFreshVarCounter :: Int}
+  deriving (Eq, Ord, Enum)
+
+-- | Run a 'Unification' effect via the 'UnificationC' carrier.
+runUnification :: Algebra sig m => UnificationC m a -> m (Either UnificationError a)
+runUnification =
+  unUnificationC >>> evalState idS >>> evalState (FreshVarCounter 0) >>> runThrow
+
+------------------------------------------------------------
+-- Unification
+
+-- The idea here (using an explicit substitution as a sort of
+-- "functional shared memory", instead of directly using IORefs), is
+-- based on Dijkstra et al. Unfortunately, their implementation of
+-- unification is subtly wrong; fortunately, a single integration test
+-- in the Swarm test suite failed, leading to discovering the bug.
+-- The basic issue is that when unifying an equation between two
+-- variables @x = y@, we must look up *both* to see whether they are
+-- already mapped by the substitution (and if so, replace them by
+-- their referent and keep recursing).  Dijkstra et al. only look up
+-- @x@ and simply map @x |-> y@ if x is not in the substitution, but
+-- this can lead to cycles where e.g. x is mapped to y, and later we
+-- unify @y = x@ resulting in both @x |-> y@ and @y |-> x@ in the
+-- substitution, which at best leads to a spurious infinite type
+-- error, and at worst leads to infinite recursion in the unify function.
+--
+-- Peyton Jones et al. show how to do it correctly: when unifying x = y and
+-- x is not mapped in the substitution, we must also look up y.
+
+-- | Implementation of the 'Unification' effect in terms of the
+--   'UnificationC' carrier.
+instance Algebra sig m => Algebra (Unification :+: sig) (UnificationC m) where
+  alg hdl sig ctx = UnificationC $ case sig of
+    L (Unify t1 t2) -> (<$ ctx) <$> runThrow (unify t1 t2)
+    L (ApplyBindings t) -> do
+      s <- get @(Subst IntVar UType)
+      (<$ ctx) <$> subst s t
+    L FreshIntVar -> do
+      v <- IntVar <$> gets getFreshVarCounter
+      modify @FreshVarCounter succ
+      return $ v <$ ctx
+    L (FreeUVars t) -> do
+      s <- get @(Subst IntVar UType)
+      (<$ ctx) . fuvs <$> subst s t
+    R other -> alg (unUnificationC . hdl) (R (R (R other))) ctx
+
+-- | Unify two types, returning a unified type equal to both.  Note
+--   that for efficiency we /don't/ do an occurs check here, but
+--   instead lazily during substitution.
+unify ::
+  ( Has (Throw UnificationError) sig m
+  , Has (State (Subst IntVar UType)) sig m
+  ) =>
+  UType ->
+  UType ->
+  m UType
+unify ty1 ty2 = case (ty1, ty2) of
+  (Pure x, Pure y) | x == y -> pure (Pure x)
+  (Pure x, y) -> do
+    mxv <- lookupS x
+    case mxv of
+      Nothing -> unifyVar x y
+      Just xv -> unify xv y
+  (x, Pure y) -> unify (Pure y) x
+  (Free t1, Free t2) -> Free <$> unifyF t1 t2
+
+-- | Unify a unification variable which /is not/ bound by the current
+--   substitution with another term.  If the other term is also a
+--   variable, we must look it up as well to see if it is bound.
+unifyVar ::
+  ( Has (Throw UnificationError) sig m
+  , Has (State (Subst IntVar UType)) sig m
+  ) =>
+  IntVar ->
+  UType ->
+  m UType
+unifyVar x (Pure y) = do
+  myv <- lookupS y
+  case myv of
+    -- x = y but the variable y is not bound: just add (x |-> y) to
+    -- the current Subst
+    --
+    -- Note, as an optimization we just call e.g. insert x (Pure y)
+    -- instead of building a singleton Subst with @(|->)@ and then
+    -- composing, since composition doesn't need to apply the newly
+    -- created binding to all the other values bound in the Subst.
+    Nothing -> modify @(Subst IntVar UType) (insert x (Pure y)) >> pure (Pure y)
+    -- x = y  and y is bound to v: recurse on x = v.
+    Just yv -> unify (Pure x) yv
+
+-- x = t for a non-variable t: just add (x |-> t) to the Subst.
+unifyVar x t = modify (insert x t) >> pure t
+
+-- | Perform unification on two non-variable terms: check that they
+--   have the same top-level constructor and recurse on their
+--   contents.
+unifyF ::
+  ( Has (Throw UnificationError) sig m
+  , Has (State (Subst IntVar UType)) sig m
+  ) =>
+  TypeF UType ->
+  TypeF UType ->
+  m (TypeF UType)
+unifyF t1 t2 = case (t1, t2) of
+  (TyBaseF b1, TyBaseF b2) -> case b1 == b2 of
+    True -> pure t1
+    False -> unifyErr
+  (TyBaseF {}, _) -> unifyErr
+  -- Note that *type variables* are not the same as *unification variables*.
+  -- Type variables must match exactly.
+  (TyVarF v1, TyVarF v2) -> case v1 == v2 of
+    True -> pure t1
+    False -> unifyErr
+  (TyVarF {}, _) -> unifyErr
+  (TySumF t11 t12, TySumF t21 t22) -> TySumF <$> unify t11 t21 <*> unify t12 t22
+  (TySumF {}, _) -> unifyErr
+  (TyProdF t11 t12, TyProdF t21 t22) -> TyProdF <$> unify t11 t21 <*> unify t12 t22
+  (TyProdF {}, _) -> unifyErr
+  (TyRcdF m1, TyRcdF m2) ->
+    case ((==) `on` M.keysSet) m1 m2 of
+      False -> unifyErr
+      _ -> fmap TyRcdF . sequence $ M.merge M.dropMissing M.dropMissing (M.zipWithMatched (const unify)) m1 m2
+  (TyRcdF {}, _) -> unifyErr
+  (TyCmdF c1, TyCmdF c2) -> TyCmdF <$> unify c1 c2
+  (TyCmdF {}, _) -> unifyErr
+  (TyDelayF c1, TyDelayF c2) -> TyDelayF <$> unify c1 c2
+  (TyDelayF {}, _) -> unifyErr
+  (TyFunF t11 t12, TyFunF t21 t22) -> TyFunF <$> unify t11 t21 <*> unify t12 t22
+  (TyFunF {}, _) -> unifyErr
+ where
+  unifyErr = throwError $ UnifyErr t1 t2